<a href="https://colab.research.google.com/github/kavishajain5/Ineuron-Internship-/blob/main/news_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Build kaggle.json at runtime so that no API credentials are stored in the
# notebook itself. Values are read from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables, falling back to an interactive prompt.
import getpass
import json
import os

kaggle_credentials = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    # getpass keeps the API key out of the rendered cell output.
    "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}
with open("kaggle.json", "w") as credentials_file:
    json.dump(kaggle_credentials, credentials_file)

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c learn-ai-bbc
! unzip /content/learn-ai-bbc.zip

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import seaborn as sns
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, SpatialDropout1D
from tensorflow.keras.layers import Dropout, Input
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [None]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK corpora used by the text-cleaning helpers:
# the stop-word list and the WordNet data used for lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# English stop-word list used by the preprocessing step.
stopwords = nltk.corpus.stopwords.words('english')

# Kept as the last expression so the preview is actually rendered
# (a bare expression in the middle of a cell displays nothing).
stopwords[0:10]


In [None]:
# Read the BBC training split extracted by the download cell and preview it.
train_csv_path = '/content/BBC News Train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Number of articles per category (checks class balance).
df['Category'].value_counts()

In [None]:
# Count missing values in every column.
df.isna().sum()

Exploratory data analysis (EDA)

In [None]:
# Replace each category name with its position in `categories`
# (business -> 0, politics -> 1, entertainment -> 2, sport -> 3, tech -> 4).
categories = ['business', 'politics', 'entertainment', 'sport', 'tech']
category_codes = list(range(len(categories)))
df['Category'] = df['Category'].replace(categories, category_codes)

In [None]:
# WordCloud and STOPWORDS are used by all word-cloud cells but were never
# imported anywhere in the notebook.
from wordcloud import STOPWORDS, WordCloud

# Stop words excluded from every word cloud below.
wc_stopwords = set(STOPWORDS)

# Category 0 == business (see the `categories` list).
business_data = df[df['Category'] == 0].Text
wordcloud = WordCloud(
    stopwords=wc_stopwords,
    max_words=200,
    background_color='white',
    width=1200,
    height=800,
).generate(" ".join(business_data))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('WordCloud for Business News Articles\n')
plt.axis('off')
plt.show()


In [None]:
# Category 1 == politics (see the `categories` list); the data was previously
# stored under the misleading name `education_data`.
politics_data = df[df['Category'] == 1].Text
education_data = politics_data  # backward-compatible alias for the old name
wordcloud = WordCloud(
    stopwords=wc_stopwords,
    max_words=200,
    background_color='white',
    width=1200,
    height=800,
).generate(" ".join(politics_data))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('WordCloud for Political News Articles\n')
plt.axis('off')
plt.show()


In [None]:
# Category 2 == entertainment (see the `categories` list).
entertainment_data = df[df['Category'] == 2].Text
entertainment_corpus = " ".join(entertainment_data)
wordcloud = WordCloud(
    stopwords=wc_stopwords,
    max_words=200,
    background_color='white',
    width=1200,
    height=800,
).generate(entertainment_corpus)

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('WordCloud for Entertainment News Articles\n')
ax.axis('off')
plt.show()


In [None]:
# Category 3 == sport (see the `categories` list).
sports_data = df[df['Category'] == 3].Text
sports_corpus = " ".join(sports_data)
wordcloud = WordCloud(
    stopwords=wc_stopwords,
    max_words=200,
    background_color='white',
    width=1200,
    height=800,
).generate(sports_corpus)

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('WordCloud for Sports News Articles\n')
ax.axis('off')
plt.show()


In [None]:
# Category 4 == tech (see the `categories` list).
technology_data = df[df['Category'] == 4].Text
technology_corpus = " ".join(technology_data)
wordcloud = WordCloud(
    stopwords=wc_stopwords,
    max_words=200,
    background_color='white',
    width=1200,
    height=800,
).generate(technology_corpus)

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('WordCloud for Technology News Articles\n')
ax.axis('off')
plt.show()


Text preprocessing

In [None]:
def chars(text):
    """Return `text` with every character other than ASCII letters, digits and spaces removed."""
    disallowed = '[^A-Za-z0-9 ]+'
    cleaned = re.sub(disallowed, '', text)
    return cleaned

def decontractions(phrase):
    """Expand English contractions in `phrase` (e.g. "won't" -> "will not").

    Both the straight (') and the typographic (’) apostrophe are handled.
    The rules are applied in the listed order; the two irregular forms
    ("won't", "can't") come first so the generic "n't" rule does not
    mangle them.
    """
    rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"won\’t", "will not"),
        (r"can\’t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        (r"n\’t", " not"),
        (r"\’re", " are"),
        (r"\’s", " is"),
        (r"\’d", " would"),
        (r"\’ll", " will"),
        (r"\’t", " not"),
        (r"\’ve", " have"),
        (r"\’m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def remove_stopwords(text, stop_words=None):
    """Drop stop words from a space-separated string.

    This helper used to be named `stopwords`, which rebound the module-level
    `stopwords` list to the function itself; the membership test then ran
    against a function object and raised TypeError. `preprocessing` already
    calls the helper as `remove_stopwords`.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level `stopwords` list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    blocked = set(stopwords if stop_words is None else stop_words)
    return ' '.join(token for token in text.split(' ') if token not in blocked)
def stemming(text, stemmer=None):
    """Stem every space-separated token of `text`.

    The original body referenced a global `porter_stemmer` that is never
    created in the notebook; a `PorterStemmer` is now instantiated on demand.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.
    stemmer : object with a `stem(word)` method, optional
        Stemmer to use. Defaults to a new `PorterStemmer()`.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    active_stemmer = PorterStemmer() if stemmer is None else stemmer
    return ' '.join(active_stemmer.stem(word) for word in text.split(' '))
def lemmatizer(text, wordnet_lemmatizer=None):
    """Lemmatize every space-separated token of `text`.

    The original body referenced a global `wordnet_lemmatizer` that is never
    created in the notebook; a `WordNetLemmatizer` is now instantiated on
    demand.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.
    wordnet_lemmatizer : object with a `lemmatize(word)` method, optional
        Lemmatizer to use. Defaults to a new `WordNetLemmatizer()`.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    active_lemmatizer = WordNetLemmatizer() if wordnet_lemmatizer is None else wordnet_lemmatizer
    return ' '.join(active_lemmatizer.lemmatize(word) for word in text.split(' '))

def preprocessing(text_df):
    """Clean a pandas Series of raw article texts.

    Steps, in order: lower-case, expand contractions, strip non-alphanumeric
    characters, remove stop words, stem, lemmatize.

    Contractions are expanded *before* punctuation is stripped. The previous
    order ran `chars` first, which deleted every apostrophe, so
    `decontractions` could never match anything.

    Parameters
    ----------
    text_df : pandas.Series of str
        Raw texts.

    Returns
    -------
    pandas.Series of str
        Cleaned texts, with the same index as the input.
    """
    text_df = text_df.apply(lambda x: x.lower())
    text_df = text_df.apply(decontractions)
    text_df = text_df.apply(chars)
    text_df = text_df.apply(remove_stopwords)
    text_df = text_df.apply(stemming)
    text_df = text_df.apply(lemmatizer)
    return text_df


Model training and evaluation

In [None]:
# NOTE(review): no cell in this notebook defines X_train / X_test / y_train /
# y_test or process_test, although later cells use all of them. The split is
# reconstructed here; test_size=0.2, random_state=0 and stratification are
# assumptions - confirm against the original experiment.
#
# The EDA section replaced the category names with integer codes; the names
# are restored for the labels because later cells pass the label encoder's
# classes as `target_names` and print them as the predicted category.
category_names = df['Category'].replace(dict(enumerate(categories)))
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    category_names,
    test_size=0.2,
    random_state=0,
    stratify=category_names,
)

process_train = preprocessing(X_train)
process_test = preprocessing(X_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary on the training texts only, then reuse the
# fitted vectorizer for the test texts.
vectorizer = TfidfVectorizer()
train_texts = process_train.values
test_texts = process_test.values
X_train_tf = vectorizer.fit_transform(train_texts)
X_test_tf = vectorizer.transform(test_texts)
X_train_tf

In [None]:
import pickle

# Save the fitted TF-IDF vectorizer to disk.
# The `with` block closes the file handle even if pickling fails.
filename = 'tfidf_vector.model'
with open(filename, 'wb') as model_file:
    pickle.dump(vectorizer, model_file)

In [None]:
from sklearn.manifold import TSNE

# Project the TF-IDF matrix to 2-D for visual inspection.
# X_train_tf is a sparse matrix, and PCA initialization (the default in
# recent scikit-learn releases) is not supported for sparse input, so the
# random initialization is requested explicitly.
tsne_model = TSNE(n_components = 2, init = 'random', random_state = 0)
tsne_data = tsne_model.fit_transform(X_train_tf)

In [None]:
# Create a dataframe from the t-SNE embedding and the training labels.
# `tsne_data` is left untouched so the cell can be re-run safely; the previous
# version overwrote it with a stacked array (a second run then failed) and
# `np.vstack` coerced the coordinates to the label dtype.
tsne_df = pd.DataFrame(data = tsne_data[:, :2], columns = ["Dim_1", "Dim_2"])
tsne_df["label"] = np.asarray(y_train)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only and apply it to both splits.
le = LabelEncoder().fit(y_train)
y_train_tr, y_test_tr = (le.transform(labels) for labels in (y_train, y_test))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision-tree baseline on the TF-IDF features.
dt_clf = DecisionTreeClassifier(random_state=0).fit(X_train_tf, y_train_tr)
print('Training score of model is',dt_clf.score(X_train_tf, y_train_tr))
y_pred = dt_clf.predict(X_test_tf)
print(classification_report(y_test_tr, y_pred, target_names=le.classes_))

# fmt='d' prints the counts as integers (the default '.2g' renders values
# such as 100 as '1e+02'); tick labels name the classes.
sns.heatmap(
    confusion_matrix(y_test_tr, y_pred),
    annot=True,
    fmt='d',
    linewidths=.5,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel('Predicted category')
plt.ylabel('True category')
plt.show()

In [None]:
# Logistic-regression classifier on the TF-IDF features.
clf = LogisticRegression(random_state=0).fit(X_train_tf, y_train_tr)
print('Training score of model is',clf.score(X_train_tf, y_train_tr))
y_pred = clf.predict(X_test_tf)
print(classification_report(y_test_tr, y_pred, target_names=le.classes_))

# fmt='d' prints the counts as integers (the default '.2g' renders values
# such as 100 as '1e+02'); tick labels name the classes.
sns.heatmap(
    confusion_matrix(y_test_tr, y_pred),
    annot=True,
    fmt='d',
    linewidths=.5,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel('Predicted category')
plt.ylabel('True category')
plt.show()

In [None]:
# ref : https://www.geeksforgeeks.org/saving-a-machine-learning-model/
import pickle

# Save the trained logistic-regression model to disk.
# The `with` block closes the file handle even if pickling fails.
filename = 'logistic_regression.model'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

# To load the model back from disk (only unpickle files you trust):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

In [None]:
# Unseen sample text (two business news snippets) used to try out the trained classifier.
new_news = '''Wipro: The IT major reported a 21 per cent decline in its June quarter net profit as higher employee-related costs pushed up the firm's overall expenses. Consolidated net profit at Rs 2,563.6 crore in April-June was 20.6 per cent, lower than Rs 3,242.6 crore net profit in the same period a year back.
NTPC: The state-owned power giant has inked a pact with Moroccan Agency for Sustainable Energy (MASEN) for cooperation in renewable energy. It signed an MoU with MASEN for cooperation in the renewable energy sector during the 17th CII EXIM Conclave on India Africa Growth Partnership held in New Delhi from 19th-20th July 2022.'''

In [None]:
# Apply the same cleaning pipeline that was used for the training texts.
# (`all_preprocessing` is not defined anywhere in the notebook; the pipeline
# function is `preprocessing`.)
pre_news = preprocessing(pd.Series(new_news))
vec_news = vectorizer.transform(pre_news)
news_pred = clf.predict(vec_news)
# Map the predicted class index back to its label via the fitted encoder.
print('New news classified as {0} category'.format(le.classes_[news_pred][0]))

In [None]:
import pickle

# NOTE(review): the cell that created `token`, `max_len`, `data_xtrain_pad` and
# `data_xtest_pad` is missing from the notebook although the LSTM section
# depends on all four. They are reconstructed here; the sequence length
# (95th percentile of the training lengths) is an assumption - TODO confirm.
token = Tokenizer()
token.fit_on_texts(process_train)  # vocabulary from the training texts only

train_sequences = token.texts_to_sequences(process_train)
test_sequences = token.texts_to_sequences(process_test)

max_len = int(np.percentile([len(sequence) for sequence in train_sequences], 95))
data_xtrain_pad = pad_sequences(train_sequences, maxlen=max_len)
data_xtest_pad = pad_sequences(test_sequences, maxlen=max_len)

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(token, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # loading
# with open('tokenizer.pickle', 'rb') as handle:
#     token = pickle.load(handle)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the labels (encoder fitted on the training labels only).
lenc = LabelEncoder().fit(y_train)
y_test_class = lenc.transform(y_test)

# One-hot encode the training labels for the categorical cross-entropy loss.
y_train_class = to_categorical(lenc.transform(y_train), num_classes=5)

In [None]:
# Input dimension of the embedding layer: one more than the number of
# entries in the tokenizer's word index.
vocabulary_size = len(token.word_index)
embed_input = vocabulary_size + 1

In [None]:
# Two stacked LSTM layers followed by a small dense head with a 5-way softmax.
# The input tensor is named `input_layer` so the builtin `input()` is no
# longer shadowed for the rest of the kernel session.
input_layer = Input(shape=(max_len,))
embed = Embedding(embed_input, 100)(input_layer)
# return_sequences=True passes the full sequence on to the second LSTM.
lstm1 = LSTM(100, return_sequences=True, return_state=False)(embed)
lstm2 = LSTM(64, dropout=0.2)(lstm1)
dense1 = Dense(64)(lstm2)
drop1 = Dropout(0.3)(dense1)
dense2 = Dense(24)(drop1)
dense3 = Dense(5, activation='softmax')(dense2)

model = Model(inputs=input_layer, outputs=dense3)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation accuracy has not improved for 3 consecutive epochs.
callback = EarlyStopping(monitor='val_accuracy', verbose=1, patience=3)

# The callback was previously created but never handed to fit(), so training
# always ran the full 70 epochs. `use_multiprocessing` is dropped: it only
# affected generator inputs and is rejected by Keras 3.
history = model.fit(
    data_xtrain_pad,
    y_train_class,
    validation_split=0.1,
    epochs=70,
    callbacks=[callback],
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = list(range(1, len(accuracy) + 1))

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(20, 5))

# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=epochs, y=accuracy, label='Training accuracy', ax=ax_accuracy)
sns.lineplot(x=epochs, y=val_accuracy, label='Validation accuracy', ax=ax_accuracy)
ax_accuracy.set_title('Training and validation accuracy')
ax_accuracy.set_xlabel('Epoch')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.legend()

sns.lineplot(x=epochs, y=loss, label='Training loss', ax=ax_loss)
sns.lineplot(x=epochs, y=val_loss, label='Validation loss', ax=ax_loss)
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

In [None]:
# Encoded ground-truth labels for the test split.
y_true = lenc.transform(y_test)

# The model outputs one score per class; keep the index of the largest one.
class_scores = model.predict(data_xtest_pad)
y_pred = np.argmax(class_scores, axis=1)

In [None]:
# Per-class metrics and confusion matrix for the LSTM model.
target_names = lenc.classes_
print(classification_report(y_true, y_pred, target_names=target_names))

# fmt='d' prints the counts as integers (the default '.2g' renders values
# such as 100 as '1e+02'); tick labels name the classes.
sns.heatmap(
    confusion_matrix(y_true, y_pred),
    annot=True,
    fmt='d',
    linewidths=.5,
    xticklabels=target_names,
    yticklabels=target_names,
)
plt.xlabel('Predicted category')
plt.ylabel('True category')
plt.show()

* We choose the Logistic Regression model for deployment.