In [46]:
# Multi Label Pkgs
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
### Split Dataset into Train and Text
from sklearn.model_selection import train_test_split
# Feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer
# ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report
# Load Data Viz Pkgs
import matplotlib.pyplot as plt
import seaborn as sns
# Load EDA Pkgs
import pandas as pd
import numpy as np

In [47]:
df = pd.read_csv("data.csv")

In [48]:
df.head()

Unnamed: 0,Paper_ID,Question_ID,Question_Type,Question_Text,Token_Text,Word_Count
0,0460_m17_qp_22.pdf,1,MAIN,"Study the map extract for Ballyvaghan, Ireland...",Study map extract Ballyvaghan Ireland scale 1:50,71
1,0460_m17_qp_22.pdf,(a),SUB,The map has blue grid lines which make square...,map blue grid line make square area land one g...,126
2,0460_m17_qp_22.pdf,10,MAIN,km2,,4
3,0460_m17_qp_22.pdf,20,MAIN,km2,,4
4,0460_m17_qp_22.pdf,50,MAIN,km2 100 km2,,12


In [49]:
df.dtypes

Paper_ID         object
Question_ID      object
Question_Type    object
Question_Text    object
Token_Text       object
Word_Count        int64
dtype: object

In [50]:
df['Token_Text']

0        Study map extract Ballyvaghan Ireland scale 1:50
1       map blue grid line make square area land one g...
2                                                     NaN
3                                                     NaN
4                                                     NaN
                              ...                        
2699      Use information Table complete pie chart mammal
2700    Using compare level threat bird coniferous tre...
2701    newspaper article impact global warming crisis...
2702                             year dodo become extinct
2703    Global warming major factor extinction specie ...
Name: Token_Text, Length: 2704, dtype: object

### Text Preprocessing
+ neattext : remove_stopwords
+ pip install neattext

In [51]:
import neattext as nt
import neattext.functions as nfx

In [52]:
# Explore For Noise
df['Question_Text'].apply(lambda x:nt.TextFrame(x).noise_scan())

0       {'text_noise': 11.267605633802818, 'text_lengt...
1       {'text_noise': 11.11111111111111, 'text_length...
2       {'text_noise': 0, 'text_length': 4, 'noise_cou...
3       {'text_noise': 0, 'text_length': 4, 'noise_cou...
4       {'text_noise': 0, 'text_length': 12, 'noise_co...
                              ...                        
2699    {'text_noise': 16.304347826086957, 'text_lengt...
2700    {'text_noise': 12.280701754385964, 'text_lengt...
2701    {'text_noise': 11.470113085621971, 'text_lengt...
2702    {'text_noise': 13.636363636363635, 'text_lengt...
2703    {'text_noise': 10.084033613445378, 'text_lengt...
Name: Question_Text, Length: 2704, dtype: object

In [53]:
# Explore For Noise
df['Question_Text'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())

0                                     [the, for, the, is]
1       [the, has, which, make, what, of, does, one, one]
2                                                      []
3                                                      []
4                                                      []
                              ...                        
2699                              [the, in, to, the, for]
2700         [using, the, of, to, and, do, not, in, your]
2701    [is, a, about, the, of, a, that, we, cannot, t...
2702                        [in, which, did, the, become]
2703        [is, a, in, the, of, one, other, of, the, of]
Name: Question_Text, Length: 2704, dtype: object

In [54]:
# Explore For Noise
df['Question_Text'].apply(nfx.remove_stopwords)

0       Study map extract Ballyvaghan, Ireland. scale ...
1       map blue grid lines squares. area land grid sq...
2                                                     km2
3                                                     km2
4                                             km2 100 km2
                              ...                        
2699    (i) Use information Table 6.1 complete pie cha...
2700    Fig. 6.1, compare level threat birds coniferou...
2701    Fig. 6.2 newspaper article impact global warmi...
2702                                   year dodo extinct?
2703    Global warming major factor extinction species...
Name: Question_Text, Length: 2704, dtype: object

In [55]:
corpus = df['Question_Text'].apply(nfx.remove_stopwords)

### Feature Engineering
+ Build features from our text
+ TFIDF,countvectorizer,bow

In [56]:
tfidf = TfidfVectorizer()

In [57]:
# Build Features
Xfeatures = tfidf.fit_transform(corpus).toarray()

In [58]:
Xfeatures

array([[0.        , 0.31890679, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [60]:
corpus.head()

0    Study map extract Ballyvaghan, Ireland. scale ...
1    map blue grid lines squares. area land grid sq...
2                                                  km2
3                                                  km2
4                                          km2 100 km2
Name: Question_Text, dtype: object

In [28]:
for val in df['Paper_ID']:
    print(val)

0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m17_qp_22.pdf
0460_m18_qp_22.pdf
0460_m18_qp_22.pdf
0460_m18_qp_22.pdf
0460_m18_qp_22.pdf
0460_m18_qp_22.pdf
0460_m18_qp_22.pdf
0460_m18_qp_

In [61]:
y = df[['Paper_ID', 'Question_Text', 'php']]

KeyError: "['php'] not in index"

In [23]:
# Split Data 
X_train,X_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.3,random_state=42)

NameError: name 'y' is not defined

In [34]:
print(df['title'].shape)
print(X_train.shape)

(144,)
(100, 404)


In [None]:
# Building Our Model
# Estimator + Multilabel Estimator

In [35]:
### Problem Transform
import skmultilearn

In [36]:
dir(skmultilearn)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'adapt',
 'base',
 'problem_transform',
 'utils']

### Binary Relevance classficiation
+ Convert Our Multi-Label Prob to Multi-Class

![](binary_relevance_multilabel_ml_jcharistech.png)

In [37]:
# Convert Our Multi-Label Prob to Multi-Class
# binary classficiation
binary_rel_clf = BinaryRelevance(MultinomialNB())

In [38]:
binary_rel_clf.fit(X_train,y_train)

BinaryRelevance(classifier=MultinomialNB(alpha=1.0, class_prior=None,
                                         fit_prior=True),
                require_dense=[True, True])

In [39]:
# Predictions
br_prediction = binary_rel_clf.predict(X_test)

In [41]:
br_prediction

<44x3 sparse matrix of type '<class 'numpy.float64'>'
	with 88 stored elements in Compressed Sparse Column format>

In [42]:
# Convert to Array  To See Result
br_prediction.toarray()

array([[1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 0., 1.],
       [1., 1., 0.],
       [1., 1., 0.]])

In [43]:
# Accuracy
accuracy_score(y_test,br_prediction)

0.9090909090909091

In [44]:
# Hamming Loss :Incorrect Predictions
# The Lower the result the better
hamming_loss(y_test,br_prediction)

0.06060606060606061

#### Classifier Chains
+ Preserve Label Correlation

![](classifier_chains_multilabel_jcharistech.png)

In [46]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [47]:
clf_chain_model = build_model(MultinomialNB(),ClassifierChain,X_train,y_train,X_test,y_test)

In [48]:
clf_chain_model

{'accuracy:': 0.8409090909090909, 'hamming_score': 0.10606060606060606}

#### LabelPowerset
![](labelPowerset_multilabel_ml_jcharistech.png)

In [49]:
clf_labelP_model = build_model(MultinomialNB(),LabelPowerset,X_train,y_train,X_test,y_test)

In [50]:
clf_labelP_model

{'accuracy:': 0.9090909090909091, 'hamming_score': 0.06060606060606061}

In [None]:
### Apply On A Simple Ttitle/Question

In [52]:
ex1 = df['title'].iloc[0]

In [58]:
# Vectorized 
vec_example = tfidf.transform([ex1])

In [60]:
# Make our prediction
binary_rel_clf.predict(vec_example).toarray()

array([[1., 1., 0.]])

In [72]:
import joblib

In [73]:
# Save Model
binary_rel_clf_file = open("binary_rel_clf_model_file.pkl","wb")
joblib.dump(binary_rel_clf,binary_rel_clf_file)
binary_rel_clf_file.close()

In [74]:
# Save Vectorizer
tfidf_vectorizer_file = open("tfidf_vectorizer_SO_tags_file.pkl","wb")
joblib.dump(tfidf,tfidf_vectorizer_file)
tfidf_vectorizer_file.close()

In [61]:
#### Adapted Algorithm
from skmultilearn.adapt import MLkNN

In [70]:
### Thanks For Watching 
### Jesus Saves @JCharisTech
### Jesse E.Agbe(JCharis)