In [23]:
import ast
import pickle
import pandas as pd
# Load the TF-IDF matrix from the file
with open("fe_data/tfidf_title_filtered_15_score5.pkl", "rb") as f:
    x_title = pickle.load(f)
    
with open("fe_data/tfidf_body_filtered_15_score5.pkl", "rb") as f:
    x_body = pickle.load(f)
    
print(x_title)


  (0, 26702)	0.489380277133263
  (0, 27034)	0.41926285164054683
  (0, 15778)	0.4559248454595617
  (0, 3613)	0.5124051539348506
  (0, 10838)	0.33808149772042295
  (1, 15403)	0.6344013808050604
  (1, 23762)	0.6230025507858479
  (1, 2338)	0.4576054083453997
  (2, 28912)	0.6850346470481155
  (2, 4996)	0.4526987345115973
  (2, 5959)	0.45677526920928146
  (2, 10148)	0.3422685225872382
  (3, 1996)	0.32919800586836895
  (3, 16936)	0.3590345688782665
  (3, 10155)	0.532173517991191
  (3, 22811)	0.5627457356837965
  (3, 1204)	0.4035237725354838
  (4, 4111)	0.5839943760309615
  (4, 4562)	0.4164902592883401
  (4, 16932)	0.5803215107818112
  (4, 27943)	0.3856337340094297
  (5, 23204)	0.357997599617262
  (5, 28740)	0.33780168029528523
  (5, 5499)	0.5487735315367471
  (5, 11562)	0.6757035996223842
  :	:
  (65222, 27943)	0.2895500352692866
  (65223, 13331)	0.5561740983053443
  (65223, 13321)	0.44712217678425425
  (65223, 4631)	0.44071812544022715
  (65223, 20626)	0.2379855700800746
  (65223, 28700)	0.2

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split


df = pd.read_csv("data/clean_data_filtered_15_score5_tokenized.csv")
df['tag'] = df['tag'].apply(ast.literal_eval)




X = hstack([x_title, x_body])
y = df['tag']


multi_label_binarizer = MultiLabelBinarizer()
y = multi_label_binarizer.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)
#X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

y_classes = multi_label_binarizer.classes_

### Metrics
- **Precision**: Precision is the ratio of true positive predictions to the total number of positive predictions made by the classifier. It measures the accuracy of positive predictions. A high precision indicates that the classifier has a low false positive rate.

- **Recall**: Recall, also known as sensitivity or true positive rate, is the ratio of true positive predictions to the total number of actual positives in the dataset. It measures the ability of the classifier to correctly identify all positive instances. A high recall indicates that the classifier has a low false negative rate.

- **F-score**: The F-score, or F1 score, is the harmonic mean of precision and recall. It provides a single score that balances both precision and recall. The F1 score reaches its best value at 1 and worst at 0.

- **Support**: Support is the number of actual occurrences of each class in the specified dataset. It represents the number of true instances for each class in the dataset.

- **Hamming Loss**: Hamming loss is the fraction of labels that are incorrectly predicted. It computes the fraction of labels that are incorrectly predicted, i.e., the fraction of the wrong labels to the total number of labels.

- **Jaccard Score**: The Jaccard score, also known as the Jaccard similarity coefficient, measures the similarity between two sets by comparing their intersection with their union. In the context of multi-label classification, it calculates the similarity between the predicted labels and the true labels. It ranges from 0 to 1, where 1 indicates perfect overlap between the predicted and true labels.

In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import hamming_loss
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, jaccard_score
from sklearn.metrics import precision_recall_fscore_support as score


    

# Prints Accuracy, Hamming loss and Jaccard score:
def evaluate_model(y_test, y_pred, model_name):
    hamming = []
    precision, recall, fscore, support = score(y_test, y_pred)
    

    for i, (test, pred) in enumerate(zip(y_test.T, y_pred.T)):
        hamming.append(hamming_loss(test, pred))
    
    acc = accuracy_score(y_test, y_pred)
    jacc_sc = jaccard_score(y_test, y_pred, average='weighted')
    hamming_ls = hamming_loss(y_test,y_pred)
    
    print(f"Hamming Loss: {hamming_ls:.4f}")
    print("Accuracy: ", acc)
    print("Classifier Used:", str(model_name))
    print(f'Jaccard Score: {jacc_sc:.4f}')
    print("\n")
    
    metric_df = pd.DataFrame(data=[precision, recall, fscore, hamming,support],
                         index=["Precision", "Recall", "F-1 score", "Hamming loss","True Count"],
                         columns=y_classes)
    metric_df.to_csv("models/models_results/"+str(model_name)+"_metrics.xlsx")
    
    #top_ten_tags = ["javascript", "java", "c#", "php", "android", "jquery", "python", "html", "c++", "ios","mysql","css","sql","asp.net","objective-c"]
    print(metric_df)




In [22]:

sgd = SGDClassifier(n_jobs=-1)
svc = LinearSVC()

for classifier in [sgd,svc]:
    #top_ten_tags = ["javascript", "java", "c#", "php", "android", "jquery", "python", "html", "c++", "ios","mysql","css","sql","asp.net","objective-c"]

    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    metric_df = evaluate_model(y_test, y_pred, classifier)

Hamming Loss: 0.0402
Accuracy:  0.5206156611616672
Classifier Used: SGDClassifier(n_jobs=-1)
Jaccard Score: 0.5248


                    .net     android           c           c#         c++  \
Precision       0.794393    0.980867    0.815534     0.833333    0.920732   
Recall          0.163148    0.786299    0.249258     0.497495    0.520092   
F-1 score       0.270701    0.872872    0.381818     0.623039    0.664710   
Hamming loss    0.051455    0.025166    0.030558     0.094484    0.051343   
True Count    521.000000  978.000000  337.000000  1397.000000  871.000000   

                     css        html         ios      iphone         java  \
Precision       0.936090    0.724138    0.809969    0.890000     0.961411   
Recall          0.656992    0.311881    0.488722    0.253561     0.638828   
F-1 score       0.772093    0.435986    0.609613    0.394678     0.767606   
Hamming loss    0.016515    0.036625    0.037412    0.030671     0.059319   
True Count    379.000000  404.00000



Hamming Loss: 0.0376
Accuracy:  0.5681384114144478
Classifier Used: LinearSVC()
Jaccard Score: 0.5856


                    .net     android           c           c#         c++  \
Precision       0.541833    0.975177    0.684492     0.798719    0.831563   
Recall          0.261036    0.843558    0.379822     0.624911    0.629162   
F-1 score       0.352332    0.904605    0.488550     0.701205    0.716340   
Hamming loss    0.056173    0.019548    0.030109     0.083586    0.048759   
True Count    521.000000  978.000000  337.000000  1397.000000  871.000000   

                     css        html         ios      iphone         java  \
Precision       0.887789    0.607287    0.761421    0.680851     0.934866   
Recall          0.709763    0.371287    0.563910    0.364672     0.715018   
F-1 score       0.788856    0.460829    0.647948    0.474954     0.810295   
Hamming loss    0.016178    0.039434    0.036625    0.031794     0.051343   
True Count    379.000000  404.000000  532.000000