In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load Data
# Read the three corpus splits from GB2312-encoded CSV files.
train_df = pd.read_csv("../prod_corpus/csharp_train_shuffle.csv",encoding='gb2312')
# NOTE(review): `test_df` is read from the *valid* file and `valid_df` from the
# *test* file -- the variable names look swapped relative to the paths; confirm
# which split the evaluation below is meant to use.
test_df = pd.read_csv("../prod_corpus/csharp_valid_shuffle.csv",encoding='gb2312')
valid_df = pd.read_csv("../prod_corpus/csharp_test_shuffle.csv",encoding='gb2312')
# Ground-truth labels for the evaluation: one zero per row of test_df.
# presumably the correct candidate is always stored at position 0 -- verify against the data.
y_test = np.zeros(len(test_df))

In [3]:
def evaluate_recall(y, y_test, k=1):
    """Compute recall@k over a set of ranked predictions.

    Args:
        y: Sequence of rankings, one per example. Each ranking is a sliceable
            sequence of candidate indices, best candidate first.
        y_test: Sequence of ground-truth labels, paired positionally with ``y``.
        k: Number of top-ranked candidates to inspect per example.

    Returns:
        Fraction (float) of examples whose label appears in the first ``k``
        entries of its ranking. Returns 0.0 when ``y`` is empty instead of
        raising ``ZeroDivisionError``.
    """
    num_examples = len(y)
    if num_examples == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # Pairs are formed with zip, so extra entries in the longer input are ignored,
    # while the denominator is always len(y).
    num_correct = sum(
        1 for predictions, label in zip(y, y_test) if label in predictions[:k]
    )
    return num_correct / float(num_examples)

In [4]:
def predict_random(query, code, k=10):
    """Return a random ranking of candidate indices (baseline predictor).

    Args:
        query: Unused; kept so the signature matches the other predictors.
        code: Sized collection of candidates; only its length is used.
        k: Maximum number of indices to return (default 10, the previously
            hard-coded value).

    Returns:
        1-D array of ``min(k, len(code))`` distinct indices into ``code`` in
        random order.
    """
    num_candidates = len(code)
    # Sampling without replacement cannot draw more items than exist; cap the
    # sample size so fewer than ``k`` candidates no longer raises ValueError.
    return np.random.choice(num_candidates, min(k, num_candidates), replace=False)

In [5]:
# Random baseline: produce one random ranking per row of test_df (the columns
# after the first are passed as the candidates), then report recall at several cutoffs.
num_rows = len(test_df)
y_random = [
    predict_random(test_df['query'][row], test_df.iloc[row, 1:].values)
    for row in range(num_rows)
]
for n in (1, 2, 5, 6, 10):
    recall_at_n = evaluate_recall(y_random, y_test, n)
    print("Recall @ (%d, 10): %f" % (n, recall_at_n))

Recall @ (1, 10): 0.096603
Recall @ (2, 10): 0.192903
Recall @ (5, 10): 0.499545
Recall @ (6, 10): 0.595693
Recall @ (10, 10): 1.000000


In [6]:
class TFIDFPredictor:
    """Rank candidate code snippets against a query by TF-IDF similarity."""

    def __init__(self):
        # Vectorizer with default settings; its vocabulary and IDF weights are
        # learned in train().
        self.vectorizer = TfidfVectorizer()

    def train(self, data):
        """Fit the vectorizer on all query and code strings in ``data``.

        Args:
            data: Table-like object exposing ``data['query'].values`` and
                ``data['code'].values`` (e.g. a pandas DataFrame).
        """
        corpus = np.concatenate([data['query'].values, data['code'].values])
        self.vectorizer.fit(corpus)

    def predict(self, query, code):
        """Rank the candidates in ``code`` by similarity to ``query``.

        Args:
            query: A single query string.
            code: Iterable of candidate strings.

        Returns:
            1-D array of indices into ``code``, highest similarity score first.
        """
        # Convert the query and the candidate code snippets into tfidf vectors
        vector_query = self.vectorizer.transform([query])
        vector_code = self.vectorizer.transform(code)
        # The dot product measures the similarity of the resulting vectors.
        # Use the sparse matrices' own .dot(): NumPy's np.dot is not aware of
        # SciPy sparse types.
        scores = vector_code.dot(vector_query.T).toarray().ravel()
        # Sort ascending, then reverse to return the indices in descending order.
        # NOTE(review): candidates with equal scores (e.g. all zeros) come out in
        # whatever order argsort leaves them, reversed -- confirm this does not
        # bias the recall numbers.
        return np.argsort(scores, axis=0)[::-1]

In [7]:
# Evaluate TFIDF predictor
# Fit on the training split, rank every test row's candidates (columns after the
# first), then report recall at several cutoffs.
pred = TFIDFPredictor()
pred.train(train_df)
y = [pred.predict(test_df['query'][x], test_df.iloc[x,1:].values) for x in range(len(test_df))]
for n in [1, 2, 5, 6, 10]:
    # Format string fixed: a stray trailing "}" used to be printed after each
    # value; it now matches the random-baseline report.
    print("Recall @ (%d, 10): %f"%(n, evaluate_recall(y, y_test, n)))

Recall @ (1, 10): 0.394146}
Recall @ (2, 10): 0.446315}
Recall @ (5, 10): 0.475887}
Recall @ (6, 10): 0.476039}
Recall @ (10, 10): 1.000000}
