# Importing libraries 

In [1]:
# importing all the libraries. If any library is missing, then you can use "pip install library name". 
import numpy as np
import pandas as pd 
import os 
from sklearn.utils import shuffle
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,accuracy_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split

# Importing data 

In [26]:
direct = []
file = []
title = []
text = []
label = []
dpath = '/Users/kedarnandiwdekar/Desktop/datasets /' # saved directory path in a variable 
for dirname, _ , filenames in os.walk(dpath):
    try:
        filenames.remove('README.TXT') # checking if theres a read me file. If true, then deleting it.
    except:
        pass
    for filename in filenames:
        direct.append(dirname)
        file.append(filename)
        label.append(dirname.split('/')[-1])
        fullpathfile = os.path.join(dirname,filename)
        with open(fullpathfile, 'r', encoding="utf8", errors='ignore') as infile: # opening the file from list of files in read mode 
            intext = ''
            firstline = True
            for line in infile:
                if firstline:
                    title.append(line.replace('\n',''))
                    firstline = False
                else:
                    intext = intext + ' ' + line.replace('\n','')
            text.append(intext)
df = pd.DataFrame(list(zip(text, label)), 
               columns =['text', 'label'])



# Dividing the dataset into two subsets train, test

In [3]:
df = shuffle(df) # Basically will shuffle the dataset to reduce the bias. 
df['category_id'] = df['label'].factorize()[0] # factorize will give numeric values to categories. 
train = df.iloc[:1780,:] # This is to divide the dataset into 80:20
test = df.iloc[1781:2225,:]

# Feature engineering  

# Stopwords

In [4]:
word = set(stopwords.words('english')) 
stop = stopwords.words('english') # Basically, we will save all the stop words collection in a list
train['text'] = train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) # we will match words in ur text to the ones from the list
train['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['text'] = train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


223     Scottish rock band Franz Ferdinand, shot promi...
1586    Thousands civil service jobs already cut moved...
567     US economic growth accelerated third quarter, ...
163     Oscar-nominated animation Shark Tale raked $80...
119     French booksellers braced rush interest anothe...
                              ...                        
1128    Graeme Souness believes Walter Smith would per...
1269    Hearts wrapped Scottish Cup quarter-final tie ...
1785    Labour's choice white candidate one UK's multi...
1693    Labour already broken pre-election promise imm...
331     Preview performances £3m musical Billy Elliot ...
Name: text, Length: 1780, dtype: object

# Stemming 

In [5]:
ps = PorterStemmer()
train['text'] = train['text'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split()]))
train['text'] = train['text'].str.replace('[^\w\s]','')
train['text']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['text'] = train['text'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split()]))
  train['text'] = train['text'].str.replace('[^\w\s]','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['text'] = train['text'].str.replace('[^\w\s]','')


223     scottish rock band franz ferdinand shot promin...
1586    thousand civil servic job alreadi cut move lon...
567     us econom growth acceler third quarter help st...
163     oscarnomin anim shark tale rake 80m 424m first...
119     french booksel brace rush interest anoth book ...
                              ...                        
1128    graem souness believ walter smith would perfec...
1269    heart wrap scottish cup quarterfin tie livings...
1785    labour choic white candid one uk multiraci sea...
1693    labour alreadi broken preelect promis immigr i...
331     preview perform 3m music billi elliot delay gi...
Name: text, Length: 1780, dtype: object

# Lemmetization 

In [6]:
import numpy as np 
wordnet_lemm = WordNetLemmatizer()
lemm_words = np.vectorize(wordnet_lemm.lemmatize)
lemm_text = ' '.join(lemm_words(train['text']))
train['text']

223     scottish rock band franz ferdinand shot promin...
1586    thousand civil servic job alreadi cut move lon...
567     us econom growth acceler third quarter help st...
163     oscarnomin anim shark tale rake 80m 424m first...
119     french booksel brace rush interest anoth book ...
                              ...                        
1128    graem souness believ walter smith would perfec...
1269    heart wrap scottish cup quarterfin tie livings...
1785    labour choic white candid one uk multiraci sea...
1693    labour alreadi broken preelect promis immigr i...
331     preview perform 3m music billi elliot delay gi...
Name: text, Length: 1780, dtype: object

# Splitting words 

In [7]:
word_count = pd.Series(' '.join(train['text']).split()).value_counts()
word_count.sample(10)
word_count.shape

(27724,)

#  TFIDF 

In [8]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', lowercase = True)
# With this method we will split our text into one and two words combination. 
# Depending upon their usage and importance they will be assigned a value.  
news = tfidf.fit_transform(train['text']).toarray()
ids = train['category_id']
print(news)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Feature selection using chi-square test 

In [9]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2
kbest = SelectKBest(score_func = chi2, k = 500 ) # Here using the chi-square test, we will determine which top 500 features will be retained. 
best_news = kbest.fit_transform(news, ids)
best_news

array([[0.0329645, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [10]:
text = train['text']
category = train['label']
text.head()

223     scottish rock band franz ferdinand shot promin...
1586    thousand civil servic job alreadi cut move lon...
567     us econom growth acceler third quarter help st...
163     oscarnomin anim shark tale rake 80m 424m first...
119     french booksel brace rush interest anoth book ...
Name: text, dtype: object

In [11]:
category

223     entertainment
1586         politics
567          business
163     entertainment
119     entertainment
            ...      
1128            sport
1269            sport
1785         politics
1693         politics
331     entertainment
Name: label, Length: 1780, dtype: object

# Splitting into train and test 

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(best_news,ids, test_size = 0.3, random_state = 60,shuffle=True)


# Model training and cross validation 

First we will use MultinomialNB model and train the model 

# MultinomialNB 

In [13]:
#Now we will first test MultinomialNB model without cross validation.
from sklearn.model_selection import StratifiedKFold  
from sklearn.metrics import confusion_matrix
multi = MultinomialNB()
multi.fit(X_train, Y_train) # fit method allows us to fit our data into the model 
Y_pred = multi.predict(X_test) # using predict() we will predict the dependent values for the corresponding independent variables.

In [14]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred)) # This is used to give the classification report for the predicted values against the original values. 

              precision    recall  f1-score   support

           0       0.98      0.90      0.94        99
           1       0.98      0.97      0.98       101
           2       0.93      0.96      0.94       116
           3       0.97      0.99      0.98       128
           4       0.92      0.96      0.94        90

    accuracy                           0.96       534
   macro avg       0.96      0.95      0.96       534
weighted avg       0.96      0.96      0.96       534



Now we cross validate using k-fold cross validation technique

In [15]:
from sklearn.model_selection import cross_val_score
multi.fit(X_train, Y_train)
scores = cross_val_score(multi, X_train, Y_train, cv=10)
scores 



array([0.96      , 0.944     , 0.92      , 0.944     , 0.936     ,
       0.96      , 0.97580645, 0.9516129 , 0.95967742, 0.92741935])

In [16]:
scores.mean(), scores.std()

(0.9478516129032257, 0.0161057651533809)

# Decision Tree 

Now we will use cross validation with decision tree model 

In [17]:
from sklearn.tree import DecisionTreeClassifier  
dtree = DecisionTreeClassifier()
dtree.fit(X_train, Y_train)
Y_pred = dtree.predict(X_test)
print(classification_report(Y_test, Y_pred))
dtree.fit(X_train, Y_train)
scores1 = cross_val_score(dtree, X_train, Y_train, cv=10)
print("Now the scores after validation are: ",scores1) 
scores1.mean(), scores1.std()

              precision    recall  f1-score   support

           0       0.77      0.78      0.77        99
           1       0.82      0.88      0.85       101
           2       0.86      0.76      0.81       116
           3       0.91      0.90      0.90       128
           4       0.81      0.87      0.84        90

    accuracy                           0.84       534
   macro avg       0.83      0.84      0.83       534
weighted avg       0.84      0.84      0.84       534





Now the scores after validation are:  [0.84       0.864      0.88       0.832      0.848      0.872
 0.82258065 0.85483871 0.81451613 0.75806452]


(0.8385999999999999, 0.03345873940326765)

# Random Forest 

Finally we can cross validate for Random forest model 

In [18]:
from sklearn.ensemble import RandomForestClassifier   
rtree = RandomForestClassifier()
rtree.fit(X_train, Y_train)
Y_pred = rtree.predict(X_test)
print(classification_report(Y_test, Y_pred))
rtree.fit(X_train, Y_train)
scores2 = cross_val_score(rtree, X_train, Y_train, cv=10)
print("Now the scores after validation are: ",scores2) 
scores2.mean(), scores2.std()

              precision    recall  f1-score   support

           0       0.97      0.93      0.95        99
           1       0.95      0.93      0.94       101
           2       0.92      0.94      0.93       116
           3       0.96      0.99      0.98       128
           4       0.93      0.92      0.93        90

    accuracy                           0.95       534
   macro avg       0.95      0.94      0.94       534
weighted avg       0.95      0.95      0.95       534





Now the scores after validation are:  [0.96       0.968      0.92       0.984      0.984      0.952
 0.9516129  0.93548387 0.94354839 0.92741935]


(0.9526064516129032, 0.020825206850704017)

# Testing 

Now we test our model against test data set

In [23]:
# Finally we will test our model performance using testing data set. 
news_test = tfidf.fit_transform(test['text']).toarray()
ids_test = test['category_id']
news_train, news_test, ids_train, ids_test = train_test_split(news_test,ids_test, test_size = 0.3, random_state = 60,shuffle=True) 
# test-size denotes what percentage of data will be in the test set. 
multi.fit(news_train, ids_train)
ids_pred = multi.predict(news_test)
print(classification_report(ids_test,ids_pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        25
           1       0.96      0.88      0.92        26
           2       0.74      1.00      0.85        26
           3       0.96      1.00      0.98        24
           4       1.00      0.82      0.90        33

    accuracy                           0.92       134
   macro avg       0.93      0.92      0.92       134
weighted avg       0.93      0.92      0.92       134

