In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [48]:
# Load the news dataset. The original path literal contained a stray double
# quote ('"data/...'), which made pandas look for a file whose name starts with
# a quote character.
news = pd.read_csv('data/bbc-text.csv')

In [49]:
# Dimensions of the loaded frame as (rows, columns).
news.shape

(2635, 2)

In [50]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed and renumber the index from 0.
# Without a seed the row order changes on every run, so the (seeded) train/test
# split further down would still select different rows and the reported
# accuracy would not be reproducible.
news = shuffle(news, random_state=42).reset_index(drop=True)

In [51]:
# Preview the first rows of the shuffled frame.
news.head()

Unnamed: 0,category,text
0,entertainment,a-listers flock to gervais sitcom hollywood ac...
1,entertainment,byrds producer melcher dies at 62 record produ...
2,politics,howard hits back at mongrel jibe michael howar...
3,entertainment,vera drake leads uk oscar hopes mike leigh s f...
4,tech,halo 2 heralds traffic explosion the growing p...


In [52]:
# Drop rows with missing values *before* casting to str: casting first turns
# NaN into the literal string 'nan', which dropna() can no longer detect.
# The result is assigned back; the original call discarded it, so nothing was
# ever removed.
news = news.dropna().astype(str)
news

Unnamed: 0,category,text
0,entertainment,a-listers flock to gervais sitcom hollywood ac...
1,entertainment,byrds producer melcher dies at 62 record produ...
2,politics,howard hits back at mongrel jibe michael howar...
3,entertainment,vera drake leads uk oscar hopes mike leigh s f...
4,tech,halo 2 heralds traffic explosion the growing p...
...,...,...
2630,business,yukos bankruptcy not us matter russian autho...
2631,business,newest eu members underpin growth the european...
2632,entertainment,west end to honour finest shows the west end i...
2633,business,russian oil merger excludes yukos the merger o...


In [53]:
# Normalise the article text to lower case using the vectorised string accessor.
news['text'] = news['text'].str.lower()
news.head()

Unnamed: 0,category,text
0,entertainment,a-listers flock to gervais sitcom hollywood ac...
1,entertainment,byrds producer melcher dies at 62 record produ...
2,politics,howard hits back at mongrel jibe michael howar...
3,entertainment,vera drake leads uk oscar hopes mike leigh s f...
4,tech,halo 2 heralds traffic explosion the growing p...


In [54]:
import string

def punctuation_removal(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters removed are exactly those in ``string.punctuation``; they
    are deleted outright rather than replaced by a space, so a hyphenated word
    such as ``a-listers`` becomes ``alisters``.
    """
    # A translation table whose third argument lists the characters to delete.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

# Strip punctuation from every article, element by element.
news['text'] = news['text'].map(punctuation_removal)

In [55]:
# Preview the text after punctuation removal.
news.head()

Unnamed: 0,category,text
0,entertainment,alisters flock to gervais sitcom hollywood act...
1,entertainment,byrds producer melcher dies at 62 record produ...
2,politics,howard hits back at mongrel jibe michael howar...
3,entertainment,vera drake leads uk oscar hopes mike leigh s f...
4,tech,halo 2 heralds traffic explosion the growing p...


In [56]:
# Removing stopwords
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per token of every article, so look words up in a
# set instead of scanning the list each time. `stop` itself stays a list.
stop_lookup = frozenset(stop)

news['text'] = news['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [57]:
# Preview the text after stop-word removal.
news.head()

Unnamed: 0,category,text
0,entertainment,alisters flock gervais sitcom hollywood actors...
1,entertainment,byrds producer melcher dies 62 record producer...
2,politics,howard hits back mongrel jibe michael howard s...
3,entertainment,vera drake leads uk oscar hopes mike leigh fil...
4,tech,halo 2 heralds traffic explosion growing popul...


In [58]:
# Function to plot the confusion matrix (code from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html)
from sklearn import metrics
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Row ``i`` is drawn along the y axis ("True label")
        and column ``j`` along the x axis ("Predicted label").
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, divide every row by its sum before plotting, so each cell
        shows the fraction of that row's total.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The plot is drawn through the ``pyplot`` state machine.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the colour bar, the cell
    # annotations and the contrast threshold all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of producing NaN / a warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals
    # so the annotations stay readable.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than the midpoint get white text, lighter ones black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [59]:
# Hold out 20% of the articles for evaluation; the seed fixes which positions
# are selected.
X_train, X_test, y_train, y_test = train_test_split(
    news['text'],
    news['category'],
    test_size=0.2,
    random_state=42,
)

In [60]:
from sklearn.linear_model import LogisticRegression
# Import only the one name the commented-out example below refers to, rather
# than `from newspaper import *`, which dumps every public name of the package
# into the notebook namespace.
from newspaper import Article

# Token counts -> TF-IDF weighting -> logistic-regression classifier.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

# Fitting the model
model = pipe.fit(X_train, y_train)

# Optional example: classify the summary of a live article. The pipeline does
# its own vectorisation, so raw text is passed straight to predict().
# url="https://timesofindia.indiatimes.com/city/indore/indore-last-2-in-munawar-case-get-bail-after-8-weeks-in-jail/articleshow/81238874.cms"
# article = Article(str(url))
# article.download()
# article.parse()
# article.nlp()
#
# p=model.predict([article.summary])
# print(p)

# Accuracy
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 98.29%


In [61]:
import pickle
final1 = 'model.sav'
# Write to the path held in `final1`; the original passed the string literal
# 'final1', so the model was saved to a file named "final1" and the variable
# was never used. The `with` block closes the file handle once the dump is done.
with open(final1, 'wb') as model_file:
    pickle.dump(model, model_file)