In [1]:
# --- Experiment configuration ------------------------------------------------
# Label handling
merge_functional = False  # True: collapse Functional-Inline/-Method/-Module into one "Functional" class

# Text preprocessing switches
use_lowercase = True
remove_special_chars = True
use_stemmer = True

# Vectorizer settings
n_gram = 1  # upper bound of the n-gram range: 1 = unigrams, 2 = up to bigrams, 3 = up to trigrams
param_min_df = 1
param_max_df = 1.0
use_tf_idf = True
use_idf = True

# Regularisation penalty ('l1' or 'l2') passed to logistic regression and the linear SVM
l_penalty = 'l2'

In [2]:
import pandas as pd 
import numpy as np
import re
from time import time

# Wall-clock start of the run; the last cell subtracts this to report the total runtime.
t_start = time()

In [3]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# any emitted by pandas or scikit-learn; consider filtering specific categories instead.
warnings.filterwarnings('ignore')

In [4]:
# Cap DataFrame previews at 10 rows so displayed frames are truncated.
pd.set_option('display.max_rows', 10)

## Loading all the data

In [5]:
def create_df(data_dir=None, file_ids=range(1, 5)):
    """Load the annotated comment files into one DataFrame.

    Each file ``OutData<id>.txt`` is read as a header-less, tab-separated table
    with ``'\\r'`` as the record terminator and is given the seven column names
    listed in ``cols``. Files that load cleanly are concatenated with a fresh
    0..n-1 index.

    If a file cannot be parsed, or does not have exactly seven columns, it is
    skipped and re-scanned line by line so that every offending line is printed
    for manual repair. A missing file is not swallowed: the error propagates.

    Parameters
    ----------
    data_dir : str or None
        Directory holding the ``OutData<id>.txt`` files. ``None`` (default)
        means ``./AnnotatedData`` relative to the working directory.
    file_ids : iterable of int
        Ids of the files to load; defaults to 1..4.

    Returns
    -------
    pandas.DataFrame
        All successfully loaded rows, or an empty DataFrame if none loaded.
    """
    import os  # local import so the cell does not depend on a separate import cell

    if data_dir is None:
        # os.path.join keeps the path valid on both Windows and POSIX systems.
        data_dir = os.path.join('.', 'AnnotatedData')

    cols = ['NaturalLanguageID', 'ProgrammingLanguageName', 'RepoID', 'SourceID', 'CommentID', 'comment', 'label']

    frames = []
    for id_ in file_ids:
        path = os.path.join(data_dir, 'OutData{}.txt'.format(id_))
        try:
            df = pd.read_csv(path, sep='\t', lineterminator='\r', header=None)
            # Raises ValueError when the file does not have exactly len(cols) columns.
            df.columns = cols
        except (pd.errors.ParserError, ValueError):
            # Malformed file: report each line that does not split into 7 fields.
            with open(path, "r", encoding="utf-8", newline="\r\n") as source:
                for line in source:
                    part = line.replace("\n", "").split("\t")
                    if len(part) != 7:
                        print('POPRAVITI u fajlu: ', id_)
                        print(part)
            continue
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [6]:
# Load the annotated files into one frame; the following cells read and modify `df`.
df = create_df()

In [7]:
# create_df reads the files with lineterminator='\r', so in "\r\n"-terminated files the
# "\n" stays attached to the first field of the following row. Remove that single leading
# newline from the first column. This is one vectorised column assignment rather than a
# per-row chained assignment (df[col][i] = ...), which may write to a temporary copy and
# fails on empty or missing values.
df['NaturalLanguageID'] = df['NaturalLanguageID'].str.replace(r'^\n', '', regex=True)

In [8]:
# Preview the loaded frame (truncated by the display.max_rows option set earlier).
df

Unnamed: 0,NaturalLanguageID,ProgrammingLanguageName,RepoID,SourceID,CommentID,comment,label
0,SR,Java,BILD-IT-Advanced-master,src\grupniProjekat_02_03_2016\SearchEngine.java,BILD-IT-Advanced-master/src\grupniProjekat_02_...,Ispisujemo poruku da je drajver loadovan,Functional-Inline
1,SR,Java,BILD-IT-Advanced-master,src\grupniProjekat_02_03_2016\SearchEngine.java,BILD-IT-Advanced-master/src\grupniProjekat_02_...,Ispis država za uneseni kontinent,Functional-Inline
2,SR,Java,BILD-IT-Advanced-master,src\grupniProjekat_02_03_2016\SearchEngine.java,BILD-IT-Advanced-master/src\grupniProjekat_02_...,Slanje upute,Functional-Inline
3,SR,Java,BILD-IT-Advanced-master,src\grupniProjekat_02_03_2016\SearchEngine.java,BILD-IT-Advanced-master/src\grupniProjekat_02_...,Zatvaranje konekcije sa bazom podataka,Functional-Inline
4,SR,Java,BILD-IT-Advanced-master,src\grupniProjekat_HotelManagement\Hotel.java,BILD-IT-Advanced-master/src\grupniProjekat_Hot...,vraca program na pocetak,Functional-Inline
...,...,...,...,...,...,...,...
3371,EN/SR,Java,SCStemmers-master,src\weka\core\stemmers\SCStemmer.java,SCStemmers-master/src\weka\core\stemmers\SCSte...,* Osnovna apstraktna klasa za funkcije zajed...,Functional-Module
3372,SR,Java,SCStemmers-master,src\weka\core\stemmers\SCStemmer.java,SCStemmers-master/src\weka\core\stemmers\SCSte...,* Inicijalizuje pravila za stemovanje * <p>...,Functional-Method
3373,SR,Java,SCStemmers-master,src\weka\core\stemmers\SerbianStemmer.java,SCStemmers-master/src\weka\core\stemmers\Serbi...,* Spisak sufiksnih pravila * <p> * <i>The ...,Functional-Inline
3374,SR,Java,SCStemmers-master,src\weka\core\stemmers\SerbianStemmer.java,SCStemmers-master/src\weka\core\stemmers\Serbi...,* Stemuje reč koja je napisana u dual1 kodir...,Functional-Method


In [9]:
# Optionally collapse the three Functional-* labels into a single "Functional" class.
# The label column is rewritten with whole-column assignments: the row-wise form
# df.loc[i]['label'] = ... is chained indexing, which can assign to a temporary row
# object and leave df itself unchanged.
if merge_functional:
    for suffix in ('-Inline', '-Method', '-Module'):
        df['label'] = df['label'].str.replace(suffix, '', regex=False)

## Preprocessing all comments

In [10]:
def _preprocess_comment(text):
    """Normalise one raw comment string according to the preprocessing flags."""
    # convert the whole text to lower case
    if use_lowercase:
        text = text.lower()
    if remove_special_chars:
        # replace every non-word character with a space
        text = re.sub(r'\W', ' ', text)
        # drop a stand-alone "n" (presumably the remainder of a literal "\n" marker
        # once its backslash has been replaced above; verify against the raw data)
        text = re.sub(r'\b[n]\b', '', text)
        # collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)
    return text


# Rewrite the whole column in one assignment; per-row chained assignment
# (df.loc[i]['comment'] = ... / df['comment'][i] = ...) may write to a temporary copy.
df['comment'] = df['comment'].map(_preprocess_comment)

In [11]:
# Print the comments stored under row labels 0..9 as a quick sanity check.
for row_label in range(10):
    print(df.loc[row_label, 'comment'])

 ispisujemo poruku da je drajver loadovan 
 ispis država za uneseni kontinent 
 slanje upute 
 zatvaranje konekcije sa bazom podataka 
 vraca program na pocetak 
 paneli za tabove u prozoru admin 
 button za pregled racuna za datog korisnika 
 button za spasavanje update a gosta 
 panel sa boxom i poljem za unos nove sobe na zahtjev gosta 
 polje za unos novog broja sobe 


## ML 

In [12]:
# Number of comments per label (class distribution).
df.label.value_counts()

Functional-Inline    1946
Functional-Method     967
Functional-Module     315
Code                   70
General                35
ToDo                   16
Notice                 16
IDE                    11
Name: label, dtype: int64

In [13]:
# Comment texts (x) and their labels (y), taken from the frame via .values.
x, y = df['comment'].values, df['label'].values

In [14]:
# Run this cell to enable stemming.

import nltk
nltk.download('punkt')

import StemmerByNikola
from StemmerByNikola import stem_str

if use_stemmer:
    # Stem every document, then replace x with an array of the stemmed texts.
    stemmed_x = [stem_str(doc) for doc in x]
    x = np.array(stemmed_x)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['jovi', 'jesam', 'isx', 'u', 'sxkol', '.', 'marij', 'jesam', 'dobr', 'devoj', '.']
 jovi jesam isx u sxkol . marij jesam dobr devoj .


In [15]:
from sklearn.model_selection import StratifiedKFold

# Outer 10-fold stratified splitter (shuffled, fixed seed) used by the evaluation functions below.
kf = StratifiedKFold(n_splits=10, random_state=151, shuffle=True)
# Last expression of the cell: displays the number of splits.
kf.get_n_splits(x)

10

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score  # (leftover note) part that does plain vectorization without TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [17]:
def create_pipeline(model='log_reg', C=1):
    """Build the vectorizer -> (optional TF-IDF) -> classifier pipeline.

    The vectorizer and TF-IDF steps are configured from the notebook-level
    settings ``use_lowercase``, ``n_gram``, ``param_min_df``, ``param_max_df``,
    ``use_tf_idf`` and ``use_idf``; the penalty comes from ``l_penalty``.

    Parameters
    ----------
    model : {'log_reg', 'bayes', 'bernoulli', 'svm'}
        Which classifier to put at the end of the pipeline.
    C : float
        Passed as ``C`` to LogisticRegression / LinearSVC; ignored by the
        naive Bayes models.

    Returns
    -------
    sklearn.pipeline.Pipeline

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names (previously this
        silently produced a pipeline without a classifier).
    """
    steps = [
        ("vect", CountVectorizer(analyzer='word',
                                 lowercase=use_lowercase,  # toggles the lowercase experiment
                                 ngram_range=(1, n_gram),  # unigrams up to n_gram-grams
                                 min_df=param_min_df, max_df=param_max_df)),  # document-frequency filtering
    ]
    if use_tf_idf:
        steps.append(("tfidf", TfidfTransformer(use_idf=use_idf)))

    if model == 'log_reg':
        if l_penalty == 'l1':
            # The default lbfgs solver does not support an l1 penalty; saga does.
            # A fixed seed keeps the stochastic solver reproducible.
            classifier = LogisticRegression(penalty=l_penalty, C=C, solver='saga', random_state=151)
        else:
            classifier = LogisticRegression(penalty=l_penalty, C=C)
    elif model == 'bayes':
        classifier = MultinomialNB()
    elif model == 'bernoulli':
        classifier = BernoulliNB()
    elif model == 'svm':
        # LinearSVC rejects penalty='l1' together with dual=True, so switch to the
        # primal formulation in that case; the l2 configuration is unchanged.
        classifier = LinearSVC(penalty=l_penalty, C=C, dual=(l_penalty != 'l1'))
    else:
        raise ValueError(
            "Unknown model {!r}; expected 'log_reg', 'bayes', 'bernoulli' or 'svm'".format(model))

    steps.append(("classifier", classifier))
    return Pipeline(steps)

In [18]:
def cross_validation(x, y, model = 'log_reg'):
    """Evaluate one pipeline over the folds of the module-level ``kf`` splitter.

    The pipeline from ``create_pipeline(model)`` is refit on each training
    split and scored on the matching test split. The model name, the mean
    accuracy and the mean macro-averaged F1 score are printed; nothing is
    returned.
    """
    clf = create_pipeline(model)

    fold_accuracy, fold_f1 = [], []
    for fit_idx, eval_idx in kf.split(x, y):
        clf.fit(x[fit_idx], y[fit_idx])

        expected = y[eval_idx]
        predicted = clf.predict(x[eval_idx])

        fold_accuracy.append(accuracy_score(expected, predicted))
        fold_f1.append(f1_score(expected, predicted, average='macro'))

    print(model)
    print("Accuracy:", sum(fold_accuracy)/len(fold_accuracy))
    print("F1 score:", sum(fold_f1)/len(fold_f1))

In [19]:
def nested_cross_validation(x, y, model='log_reg', c_values=(0.1, 1, 10, 100, 1000)):
    """Nested cross-validation that tunes ``C`` inside every outer fold.

    For each outer split of the module-level ``kf`` splitter, every candidate
    ``C`` is scored by 5-fold stratified CV on the outer training part (mean
    macro F1). The best ``C`` (first one on ties) is then refit on the whole
    outer training part and evaluated on the outer test part. Per-fold accuracy
    and macro F1 are printed, followed by the model name and the means over all
    outer folds; nothing is returned.

    Parameters
    ----------
    x, y : array-like supporting integer-array indexing
        Documents and labels.
    model : str
        Model name forwarded to ``create_pipeline``.
    c_values : sequence of float
        Candidate values for ``C``; defaults to the original grid.

    Raises
    ------
    ValueError
        If ``c_values`` is empty.
    """
    c_values = list(c_values)
    if not c_values:
        raise ValueError("c_values must contain at least one candidate for C")

    # With an integer seed every split() call reproduces the same shuffling, so the
    # inner splitter can be created once instead of once per outer fold.
    kf_inner = StratifiedKFold(n_splits=5, random_state=151, shuffle=True)

    accuracies = []
    f1_scores = []
    for train_index, test_index in kf.split(x, y):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Mean inner macro F1 for each candidate C, in the order of c_values.
        f1_average = []
        for c in c_values:
            f1_scores_inner = []
            for train_index_inner, test_index_inner in kf_inner.split(X_train, y_train):
                pipeline_inner = create_pipeline(model, C=c)
                pipeline_inner.fit(X_train[train_index_inner], y_train[train_index_inner])
                y_pred_inner = pipeline_inner.predict(X_train[test_index_inner])
                f1_scores_inner.append(
                    f1_score(y_train[test_index_inner], y_pred_inner, average='macro'))
            f1_average.append(np.mean(f1_scores_inner))

        # np.argmax returns the first maximum, so ties go to the earlier candidate.
        best_c = c_values[np.argmax(f1_average)]

        pipeline = create_pipeline(model, C=best_c)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Compute each outer-fold metric once and reuse it for both storing and printing.
        fold_accuracy = accuracy_score(y_test, y_pred)
        fold_f1 = f1_score(y_test, y_pred, average='macro')
        accuracies.append(fold_accuracy)
        f1_scores.append(fold_f1)
        print('Accuracy for C = {}: {}'.format(best_c, fold_accuracy))
        print('F1 score for C = {}: {}'.format(best_c, fold_f1))
        print('\n')

    print(model)
    print("Mean accuracy: ", np.mean(accuracies))
    print("Mean f1 score: ", np.mean(f1_scores))

# Logistic regression with the configured penalty (`l_penalty`)

In [20]:
# cross_validation(x, y, 'log_reg')

## Logistic regression - search for the optimal C value

In [21]:
# Nested cross-validation with the default model ('log_reg').
nested_cross_validation(x, y)

Accuracy for C = 1000: 0.8005865102639296
F1 score for C = 1000: 0.6140651942783278


Accuracy for C = 1000: 0.8411764705882353
F1 score for C = 1000: 0.7423640670254805


Accuracy for C = 1000: 0.8235294117647058
F1 score for C = 1000: 0.6923572308504553


Accuracy for C = 1000: 0.8764705882352941
F1 score for C = 1000: 0.616578555764411


Accuracy for C = 1000: 0.8264705882352941
F1 score for C = 1000: 0.468598982806258


Accuracy for C = 1000: 0.8402366863905325
F1 score for C = 1000: 0.5544947318007662


Accuracy for C = 100: 0.8208955223880597
F1 score for C = 100: 0.39894686544995267


Accuracy for C = 1000: 0.8952095808383234
F1 score for C = 1000: 0.8121483196623241


Accuracy for C = 1000: 0.8323353293413174
F1 score for C = 1000: 0.4599887612387612


Accuracy for C = 100: 0.8562874251497006
F1 score for C = 100: 0.5110640457445452


log_reg
Mean accuracy:  0.8413198113195393
Mean f1 score:  0.5870606754621283


## Multinomial Naive Bayes Classifier

In [22]:
# Cross-validate the 'bayes' (MultinomialNB) pipeline on the shared kf folds.
cross_validation(x, y, 'bayes')

bayes
Accuracy: 0.7693874402481814
F1 score: 0.3050871900096982


## Bernoulli Naive Bayes Classifier

In [23]:
# Cross-validate the 'bernoulli' (BernoulliNB) pipeline on the shared kf folds.
cross_validation(x, y, 'bernoulli')

bernoulli
Accuracy: 0.7735341400690021
F1 score: 0.2685805269558231


## Support Vector Machine

In [24]:
# cross_validation(x, y, 'svm')

## Support Vector Machine - search for the optimal C value

In [25]:
# Nested cross-validation for the 'svm' (LinearSVC) pipeline.
nested_cross_validation(x, y, 'svm')

Accuracy for C = 1: 0.8269794721407625
F1 score for C = 1: 0.6063593426220604


Accuracy for C = 10: 0.85
F1 score for C = 10: 0.7380212656332059


Accuracy for C = 1: 0.85
F1 score for C = 1: 0.7009620914043584


Accuracy for C = 10: 0.8647058823529412
F1 score for C = 10: 0.6305222067952533


Accuracy for C = 1: 0.8235294117647058
F1 score for C = 1: 0.38707697887026143


Accuracy for C = 1: 0.8579881656804734
F1 score for C = 1: 0.557549297428006


Accuracy for C = 1: 0.8328358208955224
F1 score for C = 1: 0.43516869117324936


Accuracy for C = 10: 0.9131736526946108
F1 score for C = 10: 0.8177766010425799


Accuracy for C = 10: 0.8233532934131736
F1 score for C = 10: 0.4544512005060834


Accuracy for C = 1: 0.8922155688622755
F1 score for C = 1: 0.5259789992827844


svm
Mean accuracy:  0.8534781267804465
Mean f1 score:  0.5853866674757843


In [26]:
# Report the elapsed wall-clock time in seconds since t_start was recorded.
t_end = time()
print(t_end - t_start)

229.24115777015686
