In [22]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

import ast
from tqdm import tqdm
tqdm.pandas()

In [23]:
# Load the processed dataset; 'Text' and 'Tags' are stored in the CSV as
# stringified Python lists.
df = pd.read_csv("./../data/processed/data_10.csv")

list_columns = ('Text', 'Tags')

# Pass 1: parse each stringified list back into a real Python list.
for column in list_columns:
    df[column] = df[column].progress_apply(ast.literal_eval)

# Pass 2: flatten each list into one space-separated string, the input format
# the vectorizers expect.
for column in list_columns:
    df[column] = df[column].progress_apply(lambda items: ' '.join(str(item) for item in items))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:08<00:00, 5094.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:00<00:00, 69700.46it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:00<00:00, 326144.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:00<00:00, 1248796.11it/s]


In [24]:
# --- Targets: one binary indicator column per tag (multi-label) ---
# Tags are space-separated strings, so plain whitespace splitting is the tokenizer.
# `token_pattern=None` states that the default regex is intentionally unused and
# avoids the "token_pattern will not be used" warning.
# `binary` must be a real bool: the string 'true' only worked because it is truthy
# and is rejected by parameter validation in recent scikit-learn releases.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None, binary=True, min_df=1)
y = vectorizer.fit_transform(df['Tags']).toarray()  # dense is cheap here: one column per tag
tags_dict = vectorizer.vocabulary_  # maps tag -> column index in `y`

# --- Features: TF-IDF over the document text ---
# Keep X as a scipy sparse matrix: TF-IDF output is overwhelmingly zeros, and both
# train_test_split and the linear models used below accept sparse input directly,
# so densifying only costs memory.
# NOTE(review): the IDF weights are fitted on all rows before the split, so test
# documents influence the features. Fit on the training split only if a strict
# hold-out estimate is required.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Text'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [27]:
# One-vs-rest wrapper: fits one independent binary classifier per column of y_train.
# Each is a logistic-regression model (loss='log_loss') trained with SGD and an
# elastic-net penalty of strength alpha.
# `random_state` pins SGD's shuffling so that re-running this cell yields the same
# model and the same report; it reuses the seed of the train/test split.
clf = OneVsRestClassifier(
    SGDClassifier(loss='log_loss', alpha=0.001, penalty='elasticnet', random_state=42)
)
clf.fit(X_train, y_train)

# Binary indicator predictions, one column per tag, for the held-out split.
y_pred = clf.predict(X_test)

In [28]:
#### Alpha Optimal regularized
# Per-tag precision/recall/F1 for the current `y_pred` against `y_test`.
# Tags or samples without any positive prediction have an undefined precision/F-score;
# zero_division=0 reports them as 0 (the value the default falls back to) without
# emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      0.03      0.05      1102
           1       0.00      0.00      0.00       592
           2       1.00      0.00      0.00      1501
           3       0.00      0.00      0.00       644
           4       0.00      0.00      0.00       640
           5       0.83      0.01      0.02      1319
           6       0.87      0.02      0.03      1209
           7       1.00      0.03      0.05       995
           8       1.00      0.05      0.09      1314
           9       1.00      0.00      0.01       636

   micro avg       0.96      0.02      0.03      9952
   macro avg       0.67      0.01      0.03      9952
weighted avg       0.77      0.02      0.03      9952
 samples avg       0.02      0.02      0.02      9952



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
#### elasticnet regularized
# NOTE(review): this cell's execution count predates the current training cell, so the
# stored output presumably comes from an earlier elastic-net run — re-run to refresh.
# zero_division=0 reports undefined precision/F-scores as 0 (the value the default
# falls back to) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      0.79      0.87      1102
           1       0.83      0.57      0.68       592
           2       0.75      0.58      0.65      1501
           3       0.87      0.44      0.58       644
           4       0.96      0.69      0.81       640
           5       0.86      0.68      0.76      1319
           6       0.76      0.58      0.66      1209
           7       0.90      0.72      0.80       995
           8       0.91      0.74      0.82      1314
           9       0.97      0.74      0.84       636

   micro avg       0.87      0.66      0.75      9952
   macro avg       0.88      0.65      0.75      9952
weighted avg       0.87      0.66      0.75      9952
 samples avg       0.68      0.67      0.67      9952



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
#### L1 regularized
# NOTE(review): this cell's execution count predates the current training cell, so the
# stored output presumably comes from an earlier run with an L1 penalty — re-run to refresh.
# zero_division=0 reports undefined precision/F-scores as 0 (the value the default
# falls back to) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      0.82      0.89      1102
           1       0.78      0.64      0.71       592
           2       0.71      0.62      0.67      1501
           3       0.82      0.57      0.68       644
           4       0.93      0.78      0.85       640
           5       0.90      0.66      0.76      1319
           6       0.80      0.56      0.66      1209
           7       0.88      0.74      0.80       995
           8       0.91      0.74      0.82      1314
           9       0.97      0.78      0.87       636

   micro avg       0.86      0.69      0.76      9952
   macro avg       0.87      0.69      0.77      9952
weighted avg       0.86      0.69      0.76      9952
 samples avg       0.71      0.71      0.70      9952



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:

#### L2 regularized
# NOTE(review): this cell's execution count predates the current training cell, so the
# stored output presumably comes from an earlier run with an L2 penalty — re-run to refresh.
# zero_division=0 reports undefined precision/F-scores as 0 (the value the default
# falls back to) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      0.78      0.86      1102
           1       0.85      0.53      0.65       592
           2       0.81      0.45      0.57      1501
           3       0.92      0.36      0.51       644
           4       0.96      0.71      0.82       640
           5       0.91      0.59      0.72      1319
           6       0.84      0.48      0.61      1209
           7       0.91      0.68      0.77       995
           8       0.93      0.65      0.77      1314
           9       0.97      0.71      0.82       636

   micro avg       0.90      0.59      0.71      9952
   macro avg       0.91      0.59      0.71      9952
weighted avg       0.90      0.59      0.71      9952
 samples avg       0.62      0.61      0.61      9952



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
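# 10 Tags: summary of results
# (the figures below match the micro/weighted averages of the L2-regularized report)

"""
alpha    precision  recall  f1-score
0.00001  0.90       0.59    0.71
"""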