In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Load the BBC news dataset from a CSV file in the working directory.
news = pd.read_csv('bbc-text.csv')

In [3]:
# Display the frame's dimensions as (rows, columns).
news.shape

(2225, 2)

In [4]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so a fresh "Restart & Run All" always
# produces the same row order, then rebuild a clean 0..n-1 index.
news = shuffle(news, random_state=42).reset_index(drop=True)

In [5]:
# Preview the first rows of the shuffled frame.
news.head()

Unnamed: 0,category,text
0,tech,gates opens biggest gadget fair bill gates has...
1,entertainment,briton wins short film oscar three of the five...
2,entertainment,glasgow hosts tsunami benefit gig the top name...
3,sport,moore questions captaincy brian moore believes...
4,politics,what really divides the parties so what is the...


In [6]:
# Drop rows with missing values *before* casting to str: astype(str) turns
# NaN into the literal string 'nan', after which dropna() finds nothing to
# remove. dropna() returns a new frame, so the result is assigned back;
# the index is rebuilt so it stays contiguous if any row was removed.
news = news.dropna().astype(str).reset_index(drop=True)
news

Unnamed: 0,category,text
0,tech,gates opens biggest gadget fair bill gates has...
1,entertainment,briton wins short film oscar three of the five...
2,entertainment,glasgow hosts tsunami benefit gig the top name...
3,sport,moore questions captaincy brian moore believes...
4,politics,what really divides the parties so what is the...
...,...,...
2220,sport,henman overcomes rival rusedski tim henman sav...
2221,business,china suspends 26 power projects china has ord...
2222,politics,kennedy criticises unfair taxes gordon brown...
2223,entertainment,wine comedy wins critics award quirky comedy ...


In [7]:
# Normalise case: lower-case every article with the vectorised string accessor.
news['text'] = news['text'].str.lower()
news.head()

Unnamed: 0,category,text
0,tech,gates opens biggest gadget fair bill gates has...
1,entertainment,briton wins short film oscar three of the five...
2,entertainment,glasgow hosts tsunami benefit gig the top name...
3,sport,moore questions captaincy brian moore believes...
4,politics,what really divides the parties so what is the...


In [8]:
import string

# Translation table that deletes every character in string.punctuation;
# built once so it is not recomputed for each article.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def punctuation_removal(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are deleted; all other characters (letters, digits, whitespace,
    non-ASCII symbols) are kept in their original order.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every article, one element at a time.
news['text'] = news['text'].map(punctuation_removal)

In [9]:
# Preview the first rows after punctuation removal.
news.head()

Unnamed: 0,category,text
0,tech,gates opens biggest gadget fair bill gates has...
1,entertainment,briton wins short film oscar three of the five...
2,entertainment,glasgow hosts tsunami benefit gig the top name...
3,sport,moore questions captaincy brian moore believes...
4,politics,what really divides the parties so what is the...


In [10]:
# Removing stopwords
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list, so `word not in stop` would scan it for every token.
# A frozenset gives constant-time membership with the same result.
stop_words = frozenset(stop)

# NOTE(review): punctuation was stripped from the text in an earlier cell, so
# any stopword that contains an apostrophe presumably can no longer match
# here — confirm whether that is acceptable.
news['text'] = news['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

In [11]:
# Preview the first rows after stopword removal.
news.head()

Unnamed: 0,category,text
0,tech,gates opens biggest gadget fair bill gates ope...
1,entertainment,briton wins short film oscar three five nomine...
2,entertainment,glasgow hosts tsunami benefit gig top names sc...
3,sport,moore questions captaincy brian moore believes...
4,politics,really divides parties gap labour tories nowad...


In [12]:
# Function to plot the confusion matrix (code from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html)
from sklearn import metrics
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Row ``i`` is drawn on the "True label" axis and
        column ``j`` on the "Predicted label" axis.
    classes : sequence
        Tick labels used for both axes, one per class.
    normalize : bool, default False
        When True, every row is divided by its row sum before plotting and
        the cell annotations are shown with two decimals.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The plot is drawn through ``matplotlib.pyplot``; one status line is
        printed saying whether normalisation was applied.
    """
    if normalize:
        # Normalise *before* drawing so the heat map, the colour threshold
        # and the cell annotations all describe the same values.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals keep normalised ratios readable; an empty format spec
    # renders raw values exactly as str() would.
    fmt = '.2f' if normalize else ''
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [13]:
# Hold out 20% of the articles for evaluation, using a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    news['text'],
    news['category'],
    test_size=0.2,
    random_state=42,
)

In [18]:
from sklearn.linear_model import LogisticRegression
from newspaper import Article  # explicit name instead of a wildcard import

# Token counts -> TF-IDF weighting -> logistic-regression classifier.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

# Fitting the model
model = pipe.fit(X_train, y_train)

# Accuracy
# Reported before the article download below, so a network failure there
# cannot prevent the score from being printed.
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

# Download and process one live news article (requires network access).
url = "https://indianexpress.com/article/india/two-months-after-coup-india-attends-military-parade-in-myanmar-7250754/"
article = Article(url)
article.download()
article.parse()
article.nlp()

# The pipeline vectorises raw text itself, so the article could be classified
# directly, without a separate vectoriser.
# NOTE(review): the article text has not been through the lower-casing,
# punctuation and stopword steps applied to the training data — confirm
# whether it should be cleaned the same way first.
# p = model.predict([article.summary])
# print(p)

accuracy: 98.2%


In [22]:
import pickle
# Persist the fitted pipeline to the path held in `final1`. The variable is
# passed to open() — not the quoted name 'final1' — so the file really is
# called 'model.sav'. The context manager closes the handle even if
# pickling fails.
final1 = 'model.sav'
with open(final1, 'wb') as model_file:
    pickle.dump(model, model_file)