In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from lazytext.supervised import LazyTextPredict
import re

# Load the dataset
df = pd.read_csv("/home/jay-vala/python projects/lazytext/tests/assets/bbc-text.csv")
df.dropna(inplace=True)


# clean the dataset
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/jay-vala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jay-
[nltk_data]     vala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jay-vala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jay-vala/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.3, random_state=13)

In [32]:
df_train

Unnamed: 0,category,text
2007,tech,new delay hits eu software laws a fresh delay ...
2172,sport,middlesbrough 2-2 charlton a late header by te...
2187,sport,jones files conte lawsuit marion jones has fil...
1878,sport,wales hails new superstar one game into his si...
1414,entertainment,doves soar to uk album summit manchester rock ...
...,...,...
153,business,german business confidence slides german busin...
1780,entertainment,hobbit picture four years away lord of the r...
866,entertainment,south bank awards honour hit soap coronation s...
74,business,call to save manufacturing jobs the trades uni...


In [33]:
# Tokenize the words
df_train['clean_text'] = df_train['text'].apply(nltk.word_tokenize)
df_test['clean_text'] = df_test['text'].apply(nltk.word_tokenize)

In [34]:
# Remove stop words
stop_words=set(nltk.corpus.stopwords.words("english"))
df_train['text_clean'] = df_train['clean_text'].apply(lambda x: [item for item in x if item not in stop_words])
df_test['text_clean'] = df_test['clean_text'].apply(lambda x: [item for item in x if item not in stop_words])

In [35]:
# Remove numbers, punctuation and special characters (only keep words)
regex = '[a-z]+'
df_train['text_clean'] = df_train['text_clean'].apply(lambda x: [item for item in x if re.match(regex, item)])
df_test['text_clean'] = df_test['text_clean'].apply(lambda x: [item for item in x if re.match(regex, item)])

In [36]:
df_train

Unnamed: 0,category,text,clean_text,text_clean
2007,tech,new delay hits eu software laws a fresh delay ...,"[new, delay, hits, eu, software, laws, a, fres...","[new, delay, hits, eu, software, laws, fresh, ..."
2172,sport,middlesbrough 2-2 charlton a late header by te...,"[middlesbrough, 2-2, charlton, a, late, header...","[middlesbrough, charlton, late, header, teenag..."
2187,sport,jones files conte lawsuit marion jones has fil...,"[jones, files, conte, lawsuit, marion, jones, ...","[jones, files, conte, lawsuit, marion, jones, ..."
1878,sport,wales hails new superstar one game into his si...,"[wales, hails, new, superstar, one, game, into...","[wales, hails, new, superstar, one, game, six,..."
1414,entertainment,doves soar to uk album summit manchester rock ...,"[doves, soar, to, uk, album, summit, mancheste...","[doves, soar, uk, album, summit, manchester, r..."
...,...,...,...,...
153,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
1780,entertainment,hobbit picture four years away lord of the r...,"[hobbit, picture, four, years, away, lord, of,...","[hobbit, picture, four, years, away, lord, rin..."
866,entertainment,south bank awards honour hit soap coronation s...,"[south, bank, awards, honour, hit, soap, coron...","[south, bank, awards, honour, hit, soap, coron..."
74,business,call to save manufacturing jobs the trades uni...,"[call, to, save, manufacturing, jobs, the, tra...","[call, save, manufacturing, jobs, trades, unio..."


In [37]:
# Lemmatization
lem = nltk.stem.wordnet.WordNetLemmatizer()
df_train['text_clean'] = df_train['text_clean'].apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])
df_test['text_clean'] = df_test['text_clean'].apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])

In [38]:
df_train

Unnamed: 0,category,text,clean_text,text_clean
2007,tech,new delay hits eu software laws a fresh delay ...,"[new, delay, hits, eu, software, laws, a, fres...","[new, delay, hit, eu, software, laws, fresh, d..."
2172,sport,middlesbrough 2-2 charlton a late header by te...,"[middlesbrough, 2-2, charlton, a, late, header...","[middlesbrough, charlton, late, header, teenag..."
2187,sport,jones files conte lawsuit marion jones has fil...,"[jones, files, conte, lawsuit, marion, jones, ...","[jones, file, conte, lawsuit, marion, jones, f..."
1878,sport,wales hails new superstar one game into his si...,"[wales, hails, new, superstar, one, game, into...","[wales, hail, new, superstar, one, game, six, ..."
1414,entertainment,doves soar to uk album summit manchester rock ...,"[doves, soar, to, uk, album, summit, mancheste...","[doves, soar, uk, album, summit, manchester, r..."
...,...,...,...,...
153,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,...","[german, business, confidence, slide, german, ..."
1780,entertainment,hobbit picture four years away lord of the r...,"[hobbit, picture, four, years, away, lord, of,...","[hobbit, picture, four, years, away, lord, rin..."
866,entertainment,south bank awards honour hit soap coronation s...,"[south, bank, awards, honour, hit, soap, coron...","[south, bank, award, honour, hit, soap, corona..."
74,business,call to save manufacturing jobs the trades uni...,"[call, to, save, manufacturing, jobs, the, tra...","[call, save, manufacture, job, trade, union, c..."


In [39]:
df_train["clean_text"] = df_train.text_clean.apply(lambda x: " ".join(x))
df_test["clean_text"] = df_test.text_clean.apply(lambda x: " ".join(x))

In [41]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(df_train.clean_text)
x_test = vectorizer.transform(df_test.clean_text)
y_train = df_train.category.tolist()
y_test = df_test.category.tolist()

In [46]:
lazy_text = LazyTextPredict(
    classification_type="multiclass",
    )
models = lazy_text.fit(x_train, x_test, y_train, y_test)

  0%|                                                                                                                                                                                                                  | 0/24 [00:00<?, ?it/s]

Processing AdaBoostClassifier estimator


  4%|████████▍                                                                                                                                                                                                 | 1/24 [00:01<00:42,  1.83s/it]

Processing BaggingClassifier estimator


  8%|████████████████▊                                                                                                                                                                                         | 2/24 [00:05<01:02,  2.83s/it]

Processing BernoulliNB estimator
Processing CalibratedClassifierCV estimator


 17%|█████████████████████████████████▋                                                                                                                                                                        | 4/24 [00:05<00:24,  1.22s/it]

Processing ComplementNB estimator
Processing DecisionTreeClassifier estimator


 25%|██████████████████████████████████████████████████▌                                                                                                                                                       | 6/24 [00:06<00:13,  1.32it/s]

Processing DummyClassifier estimator
Processing ExtraTreeClassifier estimator
Processing ExtraTreesClassifier estimator


 38%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                              | 9/24 [00:08<00:09,  1.50it/s]

Processing GradientBoostingClassifier estimator


 46%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 11/24 [00:47<01:23,  6.39s/it]

Processing KNeighborsClassifier estimator
Processing LinearSVC estimator


 50%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 12/24 [00:47<00:58,  4.90s/it]

Processing LogisticRegression estimator


 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 13/24 [00:50<00:48,  4.41s/it]

Processing LogisticRegressionCV estimator


 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 14/24 [02:40<05:22, 32.29s/it]

Processing MLPClassifier estimator


 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 15/24 [03:15<04:58, 33.21s/it]

Processing MultinomialNB estimator
Processing NearestCentroid estimator
Processing NuSVC estimator


 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 19/24 [03:24<01:06, 13.35s/it]

Processing PassiveAggressiveClassifier estimator
Processing Perceptron estimator
Processing RandomForestClassifier estimator


 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 22/24 [03:25<00:13,  6.90s/it]

Processing RidgeClassifier estimator
Processing SGDClassifier estimator


 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 23/24 [03:25<00:05,  5.34s/it]

Processing SVC estimator


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [03:34<00:00,  8.93s/it]


 Label Analysis
| Classes             | Weights              |
|--------------------:|---------------------:|
| tech                | 0.8725490196078431   |
| politics            | 1.1528497409326426   |
| sport               | 1.0671462829736211   |
| entertainment       | 0.8708414872798435   |
| business            | 1.1097256857855362   |

 Result Analysis
| Model                         | Accuracy            | Balanced Accuracy   | F1 Score            | Custom Metric Score | Time Taken          |
| ----------------------------: | -------------------:| -------------------:| -------------------:| -------------------:| -------------------:|
| AdaBoostClassifier            | 0.7260479041916168  | 0.717737172132769   | 0.7248335989941609  | NA                  | 1.829047679901123   |
| BaggingClassifier             | 0.8817365269461078  | 0.8796633962363677  | 0.8814695332332374  | NA                  | 3.5215072631835938  |
| BernoulliNB                   | 0.9535928143712575  | 0.95




In [47]:
models[0]

{'name': 'AdaBoostClassifier',
 'accuracy': 0.7260479041916168,
 'balanced_accuracy': 0.717737172132769,
 'f1_score': 0.7248335989941609,
 'custom_metric_score': 'NA',
 'time': 1.829047679901123,
 'model': AdaBoostClassifier(),
 'confusion_matrix': array([[ 89,   5,  12,  35,   3],
        [  8,  58,   5,  44,   0],
        [  5,   2, 108,  10,   1],
        [  5,   7,   5, 138,   2],
        [ 25,   5,   1,   3,  92]]),
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.67      0.62      0.64       144\n           1       0.75      0.50      0.60       115\n           2       0.82      0.86      0.84       126\n           3       0.60      0.88      0.71       157\n           4       0.94      0.73      0.82       126\n\n    accuracy                           0.73       668\n   macro avg       0.76      0.72      0.72       668\nweighted avg       0.75      0.73      0.72       668\n'}

In [42]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(x_train, y_train)

MultinomialNB()

In [43]:
preds = clf.predict(x_test)

In [44]:
preds

array(['tech', 'tech', 'tech', 'sport', 'tech', 'politics', 'business',
       'sport', 'sport', 'entertainment', 'sport', 'tech', 'politics',
       'sport', 'sport', 'business', 'politics', 'tech', 'business',
       'entertainment', 'entertainment', 'business', 'tech', 'sport',
       'tech', 'entertainment', 'sport', 'sport', 'business', 'sport',
       'sport', 'politics', 'sport', 'entertainment', 'tech', 'business',
       'tech', 'entertainment', 'entertainment', 'business', 'tech',
       'tech', 'sport', 'sport', 'tech', 'politics', 'sport', 'sport',
       'tech', 'politics', 'sport', 'entertainment', 'business',
       'business', 'business', 'entertainment', 'business', 'sport',
       'entertainment', 'business', 'politics', 'sport', 'entertainment',
       'entertainment', 'tech', 'politics', 'business', 'politics',
       'business', 'entertainment', 'business', 'entertainment',
       'business', 'sport', 'business', 'business', 'tech',
       'entertainment', 'busines

In [48]:
from sklearn.metrics import accuracy_score, classification_report
    
accuracy_score(y_test, preds)

0.9700598802395209

In [49]:
print(classification_report(y_test, preds))

               precision    recall  f1-score   support

     business       0.94      0.97      0.95       144
entertainment       1.00      0.92      0.96       115
     politics       0.95      0.98      0.96       126
        sport       1.00      1.00      1.00       157
         tech       0.97      0.98      0.97       126

     accuracy                           0.97       668
    macro avg       0.97      0.97      0.97       668
 weighted avg       0.97      0.97      0.97       668

