In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.metrics import jaccard_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from gensim.models import Word2Vec
import torch
import time
from transformers import BertTokenizer, BertModel
import tensorflow as tf
import tensorflow_hub as hub




In [2]:
# Load the question dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='X_clean.csv')

In [3]:
# Quick look at the first five rows to check the columns that were loaded.
print(df.head(n=5))

   Unnamed: 0                                              title  score  \
0           0  How can I use optional chaining with arrays an...    321   
1           1  What is the use of PYTHONUNBUFFERED in docker ...    308   
2           2  IntelliJ: Error:java: error: release version 5...    282   
3           3    Maven dependencies are failing with a 501 error    220   
4           4  react-testing-library: some portion of debug&#...    205   

                                                tags        creation_date  \
0  javascript,arrays,typescript,function,optional...  2020-01-07T08:05:02   
1                           django,docker,dockerfile  2020-01-19T17:23:11   
2                                 java,intellij-idea  2020-01-05T15:54:15   
3                   java,maven,jenkins,maven-central  2020-01-16T06:31:52   
4    javascript,reactjs,jestjs,react-testing-library  2020-01-16T06:54:02   

   is_english                       cleaned_title  \
0        True  optional chaining 

In [4]:
# Split each comma-separated tag string into a list of individual tags.
y_raw = df['tags'].apply(lambda x: x.split(','))

# Count how often every tag occurs across the whole dataset.
tag_counter = Counter(tag for tags in y_raw for tag in tags)

# Keep the 100 most frequent tags (ordered from most to least frequent).
most_common_tags = [tag for tag, _ in tag_counter.most_common(100)]

# A set gives O(1) membership checks; the list above is kept for its ordering.
_most_common_tag_set = set(most_common_tags)

# Per row, keep only the tags that belong to the retained vocabulary
# (the original tag order within each row is preserved).
df['filtered_tags'] = y_raw.apply(
    lambda tags: [tag for tag in tags if tag in _most_common_tag_set]
)

In [5]:
print(df.shape)

# Build the modelling frame without mutating `df`, so this cell can be re-run
# safely: drop rows whose tag list is missing, then rows whose tag list is
# empty after filtering.
data = (
    df.dropna(subset=['filtered_tags'])
      .loc[lambda frame: frame['filtered_tags'].apply(len) > 0]
)
print(data.shape)

(55813, 10)
(50633, 10)


In [6]:
# Inputs are the cleaned titles; raw targets are the per-row filtered tag lists.
X = data.loc[:, 'cleaned_title']
y_raw = data.loc[:, 'filtered_tags']

# Binarize the tag lists into an indicator matrix (one column per tag).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

# A tag present on every single row carries no information: drop such columns.
tag_distribution = y.sum(axis=0)
constant_tags = (tag_distribution == y.shape[0])
y = y[:, np.logical_not(constant_tags)]


In [7]:
# Fixed seed so the 80/20 split (and every metric computed from it) is
# reproducible across kernel restarts; 42 matches the seed used for the
# estimators elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
class FeatureExtractor:
    """Build several alternative feature representations of train/test texts.

    `transform` stores the splits on the instance, normalises missing values
    and then computes four representations: bag-of-words counts, averaged
    Word2Vec vectors, mean-pooled BERT embeddings and Universal Sentence
    Encoder embeddings. Each representation is exposed as a pair of
    attributes (`X_train_<name>`, `X_test_<name>`).
    """

    def __init__(self):
        # All state is populated later by `transform`.
        return

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average token vectors over non-padding positions only.

        Args:
            hidden_states: array-like of shape (batch, seq_len, dim).
            attention_mask: array-like of shape (batch, seq_len) holding 1 for
                real tokens and 0 for padding.

        Returns:
            float32 array of shape (batch, dim). Rows whose mask is entirely
            zero yield a zero vector instead of dividing by zero.
        """
        hidden_states = np.asarray(hidden_states, dtype=np.float32)
        mask = np.asarray(attention_mask, dtype=np.float32)[:, :, None]
        summed = (hidden_states * mask).sum(axis=1)
        counts = np.clip(mask.sum(axis=1), 1.0, None)
        return summed / counts

    def bow(self):
        """Fit a CountVectorizer on the training texts and encode both splits."""
        vectorizer_bow = CountVectorizer()
        self.X_train_bow = vectorizer_bow.fit_transform(self.X_train)
        self.X_test_bow = vectorizer_bow.transform(self.X_test)
        # Keep the fitted vectorizer so new texts can be encoded consistently.
        self.vectorizer_bow = vectorizer_bow

    def w2v(self, vector_size=100):
        """Train Word2Vec on the training texts and average word vectors per text.

        Args:
            vector_size: dimensionality of the word vectors (default 100).

        Texts with no in-vocabulary word are represented by a zero vector.
        """
        tokenized_corpus = [text.split() for text in self.X_train]
        w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=vector_size,
                             window=5, min_count=1, workers=4)
        word_vectors = w2v_model.wv

        def embed(texts):
            rows = []
            for sentence in texts:
                vectors = [word_vectors[word] for word in sentence.split() if word in word_vectors]
                # Fall back to a single zero vector when nothing is known.
                rows.append(np.mean(vectors or [np.zeros(vector_size)], axis=0))
            return np.array(rows)

        self.X_train_w2v = embed(self.X_train)
        self.X_test_w2v = embed(self.X_test)

    def bert(self, batch_size=20480):
        """Encode both splits with `bert-base-uncased`, mean-pooling token states.

        Args:
            batch_size: number of texts per forward pass. The default is kept
                for backward compatibility; NOTE(review): it is very large,
                pass a much smaller value (e.g. 32-256) on memory-constrained
                machines. Because pooling ignores padding, the embeddings do
                not depend on how texts are batched.
        """
        # Load the tokenizer and model once and share them across both splits.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        def encode(texts):
            all_embeddings = []
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                inputs = tokenizer(batch_texts, return_tensors='pt', padding=True,
                                   truncation=True, max_length=512)
                with torch.no_grad():
                    outputs = model(**inputs)
                # Pool over real tokens only: padded positions also produce
                # hidden states and would otherwise dilute the average.
                embeddings = self._masked_mean(
                    outputs.last_hidden_state.cpu().numpy(),
                    inputs['attention_mask'].cpu().numpy(),
                )
                all_embeddings.append(embeddings)
            return np.vstack(all_embeddings)

        self.X_train_bert = encode(self.X_train)
        self.X_test_bert = encode(self.X_test)

    def use(self, batch_size=1024):
        """Encode both splits with the Universal Sentence Encoder (TF Hub, v4).

        Args:
            batch_size: number of texts sent to the encoder per call, to bound
                peak memory.
        """
        # Load the hub module once instead of once per split.
        model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

        def encode(texts):
            if len(texts) == 0:
                # Nothing to batch; defer to the encoder as the unbatched code did.
                return model(texts).numpy()
            chunks = [
                model(texts[i:i + batch_size]).numpy()
                for i in range(0, len(texts), batch_size)
            ]
            return np.vstack(chunks)

        self.X_train_use = encode(self.X_train)
        self.X_test_use = encode(self.X_test)

    def handle_missing_values(self):
        """Replace missing texts with '' and convert both splits to lists of str."""
        self.X_train = self.X_train.fillna('').astype(str).tolist()
        self.X_test = self.X_test.fillna('').astype(str).tolist()

    def transform(self, X_train, y_train, X_test, y_test):
        """Compute every representation for the given splits.

        Args:
            X_train, X_test: text collections supporting `.fillna` (e.g. pandas
                Series).
            y_train, y_test: targets; stored on the instance but not used for
                feature extraction.

        Returns:
            dict mapping 'bow', 'w2v', 'bert' and 'use' to a
            `(train_features, test_features)` tuple.
        """
        self.X_test = X_test
        self.y_train = y_train
        self.X_train = X_train
        self.y_test = y_test
        self.handle_missing_values()
        self.bow()
        self.w2v()
        self.bert()
        self.use()
        data = {
            'bow': (self.X_train_bow, self.X_test_bow),
            'w2v': (self.X_train_w2v, self.X_test_w2v),
            'bert': (self.X_train_bert, self.X_test_bert),
            'use': (self.X_train_use, self.X_test_use)
        }
        return data

In [None]:
# Compute every feature representation (bow / w2v / bert / use) for the splits.
data_dict = FeatureExtractor().transform(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [15]:
class Model:
    """Train, evaluate, log and visualise one-vs-rest multi-label classifiers.

    Every estimator passed to `run` is trained once per feature representation
    found in `data`; metrics and the fitted model are logged to MLflow and the
    test features are plotted in 2-D.
    """

    def __init__(self, X_train, y_train, X_test, y_test, data):
        """Store the splits and the feature dictionary.

        Args:
            X_train, X_test: raw inputs (stored; `run` uses the matrices in `data`).
            y_train, y_test: binary indicator target matrices.
            data: dict mapping a vectorizer name to `(X_train_features, X_test_features)`.
        """
        self.X_test = X_test
        self.y_train = y_train
        self.X_train = X_train
        self.y_test = y_test
        self.data = data

    def precision_at_n(self, y_true, y_pred, n=10):
        """Mean per-sample precision of already-binarised top-N predictions.

        Args:
            y_true: binary indicator matrix (samples x labels).
            y_pred: binary indicator matrix with 1 on each sample's selected labels.
            n: cap on the denominator (number of labels considered per sample).

        Returns:
            Mean over samples of `hits / min(n, number_of_predicted_labels)`.
            A sample with no predicted label contributes 0 instead of NaN;
            an empty input returns 0.0.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.shape[0] == 0:
            return 0.0
        hits = np.logical_and(y_true == 1, y_pred == 1).sum(axis=1)
        denom = np.minimum(n, y_pred.sum(axis=1))
        # Guard the division: a row without predictions has precision 0.
        safe_denom = np.where(denom > 0, denom, 1)
        precisions = np.where(denom > 0, hits / safe_denom, 0.0)
        return precisions.mean()

    def recall_at_n(self, y_true, y_pred, n=10):
        """Mean per-sample recall of already-binarised top-N predictions.

        Args:
            y_true: binary indicator matrix (samples x labels).
            y_pred: binary indicator matrix with 1 on each sample's selected labels.
            n: unused; kept so the signature mirrors `precision_at_n`.

        Returns:
            Mean over samples of `hits / number_of_true_labels`. A sample with
            no true label contributes 0 instead of NaN; an empty input
            returns 0.0.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.shape[0] == 0:
            return 0.0
        hits = np.logical_and(y_true == 1, y_pred == 1).sum(axis=1)
        denom = y_true.sum(axis=1)
        safe_denom = np.where(denom > 0, denom, 1)
        recalls = np.where(denom > 0, hits / safe_denom, 0.0)
        return recalls.mean()

    def train_and_evaluate(self, X_train, X_test, y_train, y_test, model, model_name, vectorizer_name, n_tag=10):
        """Fit a one-vs-rest classifier, print and log its metrics.

        Args:
            X_train, X_test: feature matrices (dense or sparse).
            y_train, y_test: binary indicator target matrices.
            model: base estimator; it must support `predict_proba` once wrapped.
            model_name, vectorizer_name: labels used in printed output and MLflow.
            n_tag: number of highest-probability tags kept per sample for the
                precision@N / recall@N metrics.

        Returns:
            Tuple `(clf, X_test, y_pred)` with the fitted classifier, the test
            features and the thresholded predictions from `clf.predict`.
        """
        start_time = time.time()
        clf = OneVsRestClassifier(model)

        clf.fit(X_train, y_train)
        training_duration = time.time() - start_time

        prediction_start_time = time.time()
        y_pred_proba = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)
        # Mark the n_tag most probable labels of each row with 1.
        top_idx = np.argsort(-y_pred_proba, axis=1)[:, :n_tag]
        y_n = np.zeros_like(y_pred_proba)
        row_idx = np.arange(top_idx.shape[0])[:, None]
        y_n[row_idx, top_idx] = 1
        prediction_duration = time.time() - prediction_start_time

        precision = self.precision_at_n(y_test, y_n, n=n_tag)
        recall = self.recall_at_n(y_test, y_n, n=n_tag)
        score = jaccard_score(y_test, y_pred, average='samples')
        print(f'Jaccard Score ({vectorizer_name} + {model_name}): {score}')

        print(f'Precision@{n_tag} ({vectorizer_name} + {model_name}): {precision}')
        print(f'Recall@{n_tag} ({vectorizer_name} + {model_name}): {recall}')

        input_example = X_train[:1]
        # A few rows are enough to infer the input/output schema; predicting on
        # the whole training set here only wasted time.
        signature_sample = X_train[:5]
        signature = infer_signature(signature_sample, clf.predict(signature_sample))

        with mlflow.start_run():
            mlflow.log_param("vectorizer", vectorizer_name)
            mlflow.log_param("model", model_name)
            mlflow.log_metric(f"precision_at_{n_tag}", precision)
            mlflow.log_metric(f"recall_at_{n_tag}", recall)
            mlflow.log_metric("jaccard_score", score)
            mlflow.log_metric("training_duration", training_duration)
            mlflow.log_metric("prediction_duration", prediction_duration)
            mlflow.sklearn.log_model(clf, "model", signature=signature, input_example=input_example)

        return clf, X_test, y_pred

    def plot_pca_tsne(self, X, y_pred, vectorizer_name, method='PCA'):
        """Scatter-plot a 2-D projection of `X`, coloured by predicted tag.

        Args:
            X: feature matrix (dense array or scipy sparse matrix).
            y_pred: binary prediction matrix; each point is coloured by the
                index of its first predicted tag, or -1 when no tag is predicted.
            vectorizer_name: label shown in the figure title.
            method: 'PCA' or 't-SNE'.

        Raises:
            ValueError: if `method` is neither 'PCA' nor 't-SNE'.
        """
        is_sparse = hasattr(X, 'todense')
        if method == 'PCA':
            # PCA needs dense input; TruncatedSVD is the sparse-friendly equivalent.
            if is_sparse:
                reducer = TruncatedSVD(n_components=2, random_state=42)
            else:
                reducer = PCA(n_components=2)
            title = f'PCA Visualization ({vectorizer_name})'
        elif method == 't-SNE':
            if is_sparse:
                if X.shape[1] > 50:
                    # Densifying a wide sparse matrix is very memory hungry;
                    # compress to 50 components first, as scikit-learn advises
                    # for high-dimensional sparse data.
                    X = TruncatedSVD(n_components=50, random_state=42).fit_transform(X)
                else:
                    X = np.asarray(X.todense())
            reducer = TSNE(n_components=2, random_state=42)
            title = f't-SNE Visualization ({vectorizer_name})'
        else:
            raise ValueError("Method must be 'PCA' or 't-SNE'")

        X_reduced = reducer.fit_transform(X)

        # argmax alone would colour rows without any predicted tag as tag 0.
        y_pred = np.asarray(y_pred)
        point_labels = np.where(y_pred.any(axis=1), y_pred.argmax(axis=1), -1)

        plt.figure(figsize=(10, 7))
        plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=point_labels, cmap='viridis', s=10)
        plt.title(title)
        plt.xlabel(f"{method} Component 1")
        plt.ylabel(f"{method} Component 2")
        plt.colorbar()
        plt.show()

    def run(self, models):
        """Train, evaluate and plot every model on every feature representation.

        Args:
            models: dict mapping a display name to an unfitted base estimator.

        MultinomialNB is skipped for representations containing negative
        values, since it rejects such input and would abort the whole loop.
        """
        for model_name, model in models.items():
            for vectorizer_name, (X_train, X_test) in self.data.items():
                if isinstance(model, MultinomialNB) and X_train.min() < 0:
                    print(f'Skipping {vectorizer_name} + {model_name}: '
                          'MultinomialNB requires non-negative features')
                    continue
                clf, X_test, y_pred = self.train_and_evaluate(X_train, X_test, self.y_train, self.y_test, model, model_name, vectorizer_name)
                self.plot_pca_tsne(X_test, y_pred, vectorizer_name, method='PCA')
                self.plot_pca_tsne(X_test, y_pred, vectorizer_name, method='t-SNE')
            

In [16]:
# Bundle the splits and the pre-computed feature matrices into the experiment runner.
model = Model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data=data_dict,
)


In [21]:
# Base estimators to benchmark, keyed by the display name used in reports.
models = {}

# The calibration wrapper exposes predict_proba, which Model.train_and_evaluate calls.
models["SGD Classifier"] = CalibratedClassifierCV(
    SGDClassifier(random_state=42, n_jobs=10),
    method='sigmoid',
    cv=5,
)
models["MultinomialNB"] = MultinomialNB()

# Candidates currently disabled for this run:
# "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=10),
# "Logistic Regression": LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42, n_jobs=10),
# "SVM": SVC(probability=True),

model.run(models)

Jaccard Score (bow + SGD Classifier): 0.459744577202857
Precision@10 (bow + SGD Classifier): 0.1458181100029624
Recall@10 (bow + SGD Classifier): 0.8362446924064382


KeyboardInterrupt: 

In [None]:
# Second experiment: same targets, but the inputs come from the 'lm' column.
X = data['lm']
y_raw = data['filtered_tags']

# Binarize the tag lists into an indicator matrix (one column per tag).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

# Drop tags that are present on every row, since they carry no information.
tag_distribution = y.sum(axis=0)
constant_tags = tag_distribution == len(y)
y = y[:, ~constant_tags]

# Fixed seed so the 80/20 split is reproducible across runs; 42 matches the
# seed used for the estimators elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Recompute every feature representation for the new splits.
data_dict = FeatureExtractor().transform(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Rebuild the experiment runner around the new splits and feature matrices.
model = Model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data=data_dict,
)

In [None]:
# Base estimators to benchmark, keyed by the display name used in reports.
models = {}

# The calibration wrapper exposes predict_proba, which Model.train_and_evaluate calls.
models["SGD Classifier"] = CalibratedClassifierCV(
    SGDClassifier(random_state=42, n_jobs=10),
    method='sigmoid',
    cv=5,
)
models["MultinomialNB"] = MultinomialNB()
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=10)
models["Logistic Regression"] = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    random_state=42,
    n_jobs=10,
)

# Candidate currently disabled for this run:
# "SVM": SVC(probability=True),

model.run(models)