# 1. Import dataset

In [64]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier

### read data
data = pd.read_csv("/Users/timcerta/code/jbaccarin/xref/raw_data/gcj2008.csv")
# Remove NAs
data = data.dropna()
# Remove code with less than x characters
data = data.loc[data['flines'].str.len() > 5]
# Remove users with entries < 25
data["username"].value_counts()
data = data[data['username'].map(data['username'].value_counts()) > 25].reset_index(drop = True)

#data = data[:1000]
#print(data["flines"])

In [62]:
data["username"].value_counts()


AKTechie      63
Jimb          52
Qingchun      47
amihk         46
bmerry        45
              ..
XiaoZiqian    26
domeng        26
elizarov      26
stone         26
kitamasa      26
Name: username, Length: 122, dtype: int64

# 2. Data Overview

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3892 entries, 0 to 3891
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3892 non-null   int64 
 1   year        3892 non-null   int64 
 2   round       3892 non-null   int64 
 3   username    3892 non-null   object
 4   task        3892 non-null   int64 
 5   solution    3892 non-null   int64 
 6   file        3892 non-null   object
 7   full_path   3892 non-null   object
 8   flines      3892 non-null   object
dtypes: int64(5), object(4)
memory usage: 273.8+ KB


In [66]:
data.flines[20]


'\n // Headers {{{\n #include<iostream>\n #include<assert.h>\n #include<cstdio>\n #include<cctype>\n #include<cmath>\n #include<cstdlib>\n #include<algorithm>\n #include<vector>\n #include<string>\n #include<list>\n #include<deque>\n #include<map>\n #include<set>\n #include<queue>\n #include<stack>\n #include<utility>\n #include<sstream>\n #include<cstring>\n #include<bitset>\n #include<numeric>\n using namespace std;\n \n \n #define FOR(I,A,B) for(int I=(A);I<=(B);++I)\n #define FORD(I,A,B) for(int I=(A);I>=(B);--I)\n #define REP(I,N) for(int I=0;I<(N);++I)\n #define VAR(V,init) __typeof(init) V=(init)\n #define FORE(I,C) for(VAR(I,(C).begin());I!=(C).end();++I)\n #define CLR(A,v) memset((A),v,sizeof((A)))\n \n #define SIZE(x) ((int)((x).size()))\n #define ALL(X) (X).begin(),(X).end()\n #define PB push_back\n #define MP make_pair\n #define FI first\n #define SE second\n \n typedef vector<int> VI;\n typedef pair<int,int> PI;\n typedef long long LL;\n typedef vector<string> VS;\n // }}}

# 3 Functions for data preprocessing/ feature selection

## 3.1 Create metrics

In [67]:
from statistics import mean
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from string import punctuation
from collections import Counter

def create_metrics(code):
        """
        Creates metrics needed for modeling
        :return: returns an Array with metrics calculated from the source code
        """
        # TODO: assign vars for redundant code
        # TODO: self.var for all variable assignment

        # Number of characters in code/ length of code
        code_len = len(code)

        # Number of characters in line
        code_tokens = code.split("\n")
        nchar_in_line = mean([code_len for token in code_tokens])

        # Lines of code / file length in chars
        n_lines = code.count('\n')

        # Create n-grams
        #unigram = word2ngrams(text = self.code, n = 1)
        #bigram = word2ngrams(text = self.code, n = 2)
        #trigram = word2ngrams(text = self.code, n = 3)

        # Number of words in the text / file length in characters
        word_len_ratio = len(code.split())/code_len

        # Average number of words per line / file length in characters
        pre = (code.split('\n'))
        words_list = [x.split() for i, x in enumerate(pre)]
        words_count = [len(words_list[i]) for i, x in enumerate(pre)]
        avg_words_per_line = np.mean(words_count)

        # Number of whitespace / file length in characters
        whitespace_ratio = code.count(' ')/code_len

        # Number of line breaks / file length in characters
        linebreak_ratio = code.count('\n')/len(code)

        # Number of indentations / file length in characters
        indent_ratio = code.count('\t')/code_len

        # Number of upper_case words / file length in characters
        uppercase_ratio = sum(1 for char in code if char.isupper())/code_len

        # Number of lower_case words / file length in characters
        lowercase_ratio = sum(1 for char in code if char.islower())/code_len

        # Number of punctuation symbols / file length in characters
        punctuation_count = Counter(punc for line in code for punc in line if punc in punctuation)
        punctuation_count_ratio = sum(punctuation_count.values())/ code_len

        # Average length of words(exluding punctuation, excluding single letter or number)
        #code_no_punc = code
        #for punc in punctuation:
        #    code_no_punc = code_no_punc.replace(punc, '')
        #code_no_single_char = ' '.join([w for w in code_no_punc.split() if len(w)>1])
        #list_len_words = [len(x) for i, x in enumerate(code_no_single_char.split())]
        #avg_char_per_word = np.mean(list_len_words)

        #Keywords count ('if', 'else', 'while', 'for', 'in', 'elif', 'or', 'not', 'with', 'and', 'is')
        keyword_count_ratio = sum([code.count(x) for x in ['if', 'else', 'while', 'for', 'in', 'elif', 'or', 'not', 'with', 'and', 'is'] ]) / code_len

        # Return all metrics as array
        return pd.Series([code_len, nchar_in_line, n_lines, word_len_ratio, avg_words_per_line, whitespace_ratio,linebreak_ratio,indent_ratio,uppercase_ratio,lowercase_ratio, punctuation_count_ratio,keyword_count_ratio])

In [68]:
#metrics = data_small.flines.apply(lambda x:create_metrics(code = x))
metrics = pd.DataFrame()
metrics[["code_len", "nchar_in_line", "n_lines", "word_len_ratio", "avg_words_per_line", "whitespace_ratio","linebreak_ratio","indent_ratio","uppercase_ratio","lowercase_ratio", "punctuation_count_ratio","keyword_count_ratio"]] = data.flines.apply(lambda x:create_metrics(code = x))
metrics

Unnamed: 0,code_len,nchar_in_line,n_lines,word_len_ratio,avg_words_per_line,whitespace_ratio,linebreak_ratio,indent_ratio,uppercase_ratio,lowercase_ratio,punctuation_count_ratio,keyword_count_ratio
0,1143.0,1143.0,46.0,0.196850,4.787234,0.359580,0.040245,0.0,0.017498,0.336833,0.219598,0.036745
1,3300.0,3300.0,122.0,0.187273,5.024390,0.374242,0.036970,0.0,0.024848,0.327879,0.211818,0.025455
2,3300.0,3300.0,122.0,0.187273,5.024390,0.374242,0.036970,0.0,0.024848,0.327879,0.211818,0.025455
3,3022.0,3022.0,126.0,0.178028,4.236220,0.447717,0.041694,0.0,0.007280,0.272336,0.198544,0.030774
4,1658.0,1658.0,62.0,0.193607,5.095238,0.415561,0.037394,0.0,0.016285,0.282871,0.223160,0.037394
...,...,...,...,...,...,...,...,...,...,...,...,...
3887,1926.0,1926.0,81.0,0.111111,2.609756,0.361371,0.042056,0.0,0.021807,0.397196,0.154725,0.032710
3888,797.0,797.0,32.0,0.119197,2.878788,0.343789,0.040151,0.0,0.010038,0.377666,0.181932,0.046424
3889,797.0,797.0,32.0,0.119197,2.878788,0.343789,0.040151,0.0,0.010038,0.377666,0.181932,0.046424
3890,733.0,733.0,33.0,0.117326,2.529412,0.291951,0.045020,0.0,0.004093,0.482947,0.160982,0.047749


# Modeling only with metrics


## Encode Labels

In [69]:
# Encode Label
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

# Fit it to the target
label_encoder.fit(data["username"])

# Find the encoded classes
print(f"The Label Encoder has encoded the classes into {label_encoder.classes_}")

# Transform the targets
data["username_encoded"] = label_encoder.transform(data["username"])

The Label Encoder has encoded the classes into ['ACRush' 'AKTechie' 'AS1' 'Ahyangyi' 'Alexus' 'Amber' 'Astein' 'Bohua'
 'Chmel.Tolstiy' 'DNNX' 'DmitryKlenov' 'Eryx' 'Fire' 'Gluk' 'Hammer'
 'HiltonLange' 'Huayang' 'Innovative.Cat' 'Jacek' 'JanKuipers' 'Jedi'
 'Jimb' 'JongMan' 'KOTEHOK' 'KUNES' 'Klinck' 'LayCurse' 'LinesPrower'
 'Loner' 'LucaB' 'Lunarmony' 'MB.' 'OpenGL' 'PaulJefferys' 'Qingchun'
 'Reid' 'Robinnibor' 'Savior' 'SkidanovAlexander' 'Soultaker' 'TripleM'
 'Vasyl' 'Vedensky' 'Vitaliy' 'Vytis' 'XiaoZiqian' 'Yarin' 'Ying' 'almelv'
 'amihk' 'andersk' 'andrewzta' 'antimatter' 'ardiankp' 'austrin'
 'blueblimp' 'bmerry' 'burunduk3' 'darnley' 'darthur' 'dgozman' 'domeng'
 'dzhulgakov' 'dzwiedziu' 'eagleonhill' 'elizarov' 'falagar' 'ftc'
 'fuwenjie' 'g201513' 'gawry' 'guitarboy' 'gusakov' 'halyavin' 'henshiru'
 'hmich' 'humblefool' 'ilyakor' 'ilyaraz' 'ivan.popelyshev' 'iwi' 'jakubr'
 'jpsbur' 'kinaba' 'kitamasa' 'klopyrev' 'kp7' 'krijgertje' 'ltdtl'
 'lympanda' 'macs' 'misof' 'moran

In [70]:
X = metrics
y = data["username_encoded"]

In [71]:
data["username"].value_counts()

AKTechie      63
Jimb          52
Qingchun      47
amihk         46
bmerry        45
              ..
XiaoZiqian    26
domeng        26
elizarov      26
stone         26
kitamasa      26
Name: username, Length: 122, dtype: int64

In [72]:
X.isna().sum()

code_len                   0
nchar_in_line              0
n_lines                    0
word_len_ratio             0
avg_words_per_line         0
whitespace_ratio           0
linebreak_ratio            0
indent_ratio               0
uppercase_ratio            0
lowercase_ratio            0
punctuation_count_ratio    0
keyword_count_ratio        0
dtype: int64

In [73]:

clf = MultinomialNB()
print(cross_val_score(clf, X, y, cv=5))
#clf.fit(X, y)


[0.07445443 0.09499358 0.08868895 0.08868895 0.07712082]


In [22]:
# Instantiate TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=1000, verbosity=2, scoring='recall', n_jobs=-1, cv=5)

# Process autoML with TPOT
tpot.fit(X, y)

# Print score
print(tpot.score(X, y))

In [None]:
#metrics = metrics.to_numpy()
metrics.shape

(1000, 13)

## 3.2 Create Count Vectorization for text + ngrams

In [74]:
def count_vectorizer(flines, analyzer = 'word', ngram_range = (1, 1), min_df = 3):
    """
    Apply count vectorizer to a given column (flines)
    :param analyzer:
        if 'word': vectorization happens with word n-grams
        if 'char': vectorization happens with character n-grams, padding the empty space to the tokens
        if 'char_wb': creates character n-grams only from text inside word boundaries
    :param ngram_range: the lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted.
        examples: (1, 1) extracts unigrams, (1, 2) extracts unigrams and bigrams
    :param min_df: remove terms with frequency lower than threshold
    :return: array of count_frequencies
    """
    count_vectorizer = CountVectorizer(analyzer = analyzer, ngram_range = ngram_range)
    X = count_vectorizer.fit_transform(flines).toarray()
    return X

## Apply count_vectorizer to word

In [75]:
word_count_vec = count_vectorizer(flines = data_small["flines"])
word_count_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Apply count_vectorizer to n-grams

In [76]:
ngram_count_vec = count_vectorizer(flines = data_small["flines"], analyzer = "char", ngram_range = (1, 3))
ngram_count_vec

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [11,  0,  0, ...,  0,  0,  0]])

In [8]:
metrics.shape, word_count_vec.shape, ngram_count_vec.shape

((3892, 12), (1000, 6598), (1000, 35619))

In [9]:
#np.append([metrics, word_count_vec, ngram_count_vec], values = 1)

res = np.concatenate( (metrics, word_count_vec, ngram_count_vec), axis = 1)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 3892 and the array at index 1 has size 1000

# Modeling

In [78]:
# Define X and y
y = data["username_encoded"]
X = word_count_vec

In [79]:
clf = MultinomialNB()
print(cross_val_score(clf, X, y, cv=5))
#clf.fit(X, y)


ValueError: Found input variables with inconsistent numbers of samples: [1000, 3892]

In [None]:
# Instantiate TPOTClassifier
tpot = TPOTClassifier(generations=4, population_size=20, verbosity=2, scoring='recall', n_jobs=-1, cv=5)

# Process autoML with TPOT
tpot.fit(X, y)

# Print score
print(tpot.score(X, y))


Imputing missing values in feature set


ValueError: Error: Input data is not in a valid format. Please confirm that the input data is scikit-learn compatible. For example, the features must be a 2-D array and target labels must be a 1-D array.

# Calculate T-Pot

In [None]:
import os
from tpot import TPOTClassifier

X = df2
y = data_small["username"]

# Instantiate TPOTClassifier
tpot = TPOTClassifier(generations=4, population_size=20, verbosity=2, scoring='recall', n_jobs=-1, cv=5)

# Process autoML with TPOT
tpot.fit(X, y)

# Print score
print(tpot.score(X, y))

Imputing missing values in feature set


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: -inf


ValueError: n_splits=5 cannot be greater than the number of members in each class.