In [1]:
import nltk
# Fetch the NLTK resources used by the preprocessing cells further down.
nltk.download('punkt')                       # tokenizer models (word_tokenize)
nltk.download('wordnet')                     # lexical database (WordNetLemmatizer)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (pos_tag)
nltk.download('stopwords')                   # stop-word lists (stopwords.words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
# Seed NumPy's global RNG so anything drawing from it is repeatable in a top-to-bottom run.
np.random.seed(100)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the labelled articles; later cells read the 'category' and 'text' columns.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, presumably a saved
# index -- consider index_col=0 if it is not needed.
train = pd.read_csv('train.csv')

In [5]:
# List the distinct class labels present in the data.
train['category'].unique()

array(['tech', 'business', 'sport', 'entertainment', 'politics'],
      dtype=object)

In [6]:
# Display the loaded frame for a quick visual check.
train

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [7]:
# Normalise case so tokens that differ only in capitalisation are treated alike.
train['text'] = train['text'].map(str.lower)

In [8]:
# Replace each document string with its list of word tokens.
train['text'] = train['text'].map(word_tokenize)

In [9]:
# Lookup from the first letter of a POS tag to the WordNet POS constant expected by
# the lemmatizer; any letter other than J/V/R falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)


In [10]:
# Single lemmatizer instance, reused for every token in the preprocessing loop.
word_Lemmatized = WordNetLemmatizer()


In [11]:
# Build the cleaned 'text_final' column: for every tokenised document keep only
# alphabetic, non-stop-word tokens and lemmatize them using their POS tag.
# `tag_map` and `word_Lemmatized` are defined in the cells above.

# Load the stop-word list once and keep it as a set: the original re-read the
# corpus and scanned a list for every single token.
english_stopwords = frozenset(stopwords.words('english'))

text_final = []
for entry in train['text']:
    lemmas = [
        word_Lemmatized.lemmatize(word, pos=tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in english_stopwords
    ]
    # Keep the str(list) representation so the column content (and therefore the
    # TF-IDF input) is unchanged.
    text_final.append(str(lemmas))

# Assign the whole column at once (positionally) instead of growing it row by row
# through .loc, which was slow and relied on the index being 0..n-1.
train['text_final'] = text_final

In [12]:
# Turn each cleaned document into a TF-IDF vector over at most 5000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test split,
# so its vocabulary and IDF weights also reflect the held-out documents.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_sparse = tfidf.fit_transform(train['text_final'])
X = tfidf_sparse.toarray()

In [30]:
# Convert the categorical labels to integers:
# tech-0, business-1, sport-2, entertainment-3, politics-4.
CATEGORY_TO_ID = {
    'tech': 0,
    'business': 1,
    'sport': 2,
    'entertainment': 3,
    'politics': 4,
}

# Assign the result back to the column. The original called
# `train['category'].replace(..., inplace=True)`, a chained in-place update on a
# column selection that does not modify `train` under pandas Copy-on-Write.
# `.get(label, label)` leaves already-encoded values untouched so the cell can be
# re-run; `.astype('int64')` fails loudly if an unexpected label is present.
train['category'] = (
    train['category']
    .map(lambda label: CATEGORY_TO_ID.get(label, label))
    .astype('int64')
)

In [16]:
# Target vector: the 'category' column (integer codes after the encoding cell).
Y = train['category']

In [17]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X,Y,test_size=0.3,random_state = 40)

In [18]:
# Sanity-check the split sizes: features first, then labels.
for split in (Train_X, Test_X, Train_Y, Test_Y):
    print(split.shape)

(1557, 5000)
(668, 5000)
(1557,)
(668,)


In [20]:
# Multinomial Naive Bayes trained on the TF-IDF features.
NB = naive_bayes.MultinomialNB()
NB.fit(Train_X, Train_Y)

# Predict the labels of the held-out split.
pred_NB = NB.predict(Test_X)

# accuracy_score expects (y_true, y_pred). The value is the same either way, but
# the order now matches the API and the classification_report calls.
print("Naive Bayes Accuracy Score -> ", accuracy_score(Test_Y, pred_NB) * 100)

Naive Bayes Accuracy Score ->  97.75449101796407


In [21]:
# Per-class precision / recall / F1 for Naive Bayes on the held-out split.
print(classification_report(Test_Y,pred_NB))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       126
           1       0.99      0.97      0.98       147
           2       0.99      1.00      1.00       150
           3       0.98      0.96      0.97       110
           4       0.96      0.97      0.97       135

    accuracy                           0.98       668
   macro avg       0.98      0.98      0.98       668
weighted avg       0.98      0.98      0.98       668



In [22]:
# Linear-kernel support vector classifier. `degree` is omitted: SVC only uses it
# for the 'poly' kernel, so passing it with kernel='linear' had no effect.
SVM = svm.SVC(C=1.0, kernel='linear')

In [23]:
# Fit the SVM on the training split.
SVM.fit(Train_X, Train_Y)

In [24]:
# Predict the labels of the held-out split with the fitted SVM.
pred_SVM = SVM.predict(Test_X)

In [25]:
# accuracy_score expects (y_true, y_pred). The value is the same either way, but
# the order now matches the API and the classification_report calls.
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, pred_SVM) * 100)

SVM Accuracy Score ->  98.05389221556887


In [26]:
# Per-class precision / recall / F1 for the SVM on the held-out split.
print(classification_report(Test_Y,pred_SVM))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       126
           1       0.98      0.98      0.98       147
           2       0.99      1.00      0.99       150
           3       0.98      0.98      0.98       110
           4       0.97      0.96      0.97       135

    accuracy                           0.98       668
   macro avg       0.98      0.98      0.98       668
weighted avg       0.98      0.98      0.98       668



In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Random forest with default hyper-parameters. random_state is pinned so the
# fitted model (and the score below) no longer depends on the global RNG state,
# i.e. on which cells ran before this one or how often it was re-run.
classifier = RandomForestClassifier(random_state=100)
classifier.fit(Train_X, Train_Y)

# Evaluate on the held-out split. accuracy_score expects (y_true, y_pred).
predictions = classifier.predict(Test_X)
print("RandomForest Accuracy Score -> ", accuracy_score(Test_Y, predictions) * 100)


RandomForest Accuracy Score ->  96.25748502994011


In [29]:
# Per-class precision / recall / F1 for the random forest on the held-out split.
print(classification_report(Test_Y, predictions))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       126
           1       0.92      0.99      0.96       147
           2       0.98      0.99      0.98       150
           3       0.96      0.95      0.96       110
           4       0.97      0.93      0.95       135

    accuracy                           0.96       668
   macro avg       0.96      0.96      0.96       668
weighted avg       0.96      0.96      0.96       668

