# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report as clrp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
#importing dataset
dataset = pd.read_csv('train.csv')

In [3]:
#viewing 5 rows of dataset
dataset.head(5)

Unnamed: 0,Id,Title,Content,Label
0,227464,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment
1,244074,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment
2,60707,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology
3,27883,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology
4,169596,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business


In [4]:
xtrain = dataset['Title']+dataset['Content']
ytrain = dataset['Label']
article=xtrain

# TEXT CLEANING 

To clean the text, the following steps are implemented:

- removing special characters and numbers
- changing all letters to lower cases
- removing stopwords
- lematization 



In [5]:
lemmatizer = WordNetLemmatizer() 
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english3')

def lemmatize_text1(text):
    return [lemmatizer.lemmatize(w,pos="v") for w in w_tokenizer.tokenize(text)]
def lemmatize_text2(text):
    return [lemmatizer.lemmatize(w,pos="n") for w in text]

In [6]:
article= article.str.replace('[^\w\s]','') #remove punctuation
article= article.str.replace('\d+', '') #remove numbers 
article= article.apply(lambda x: " ".join(x.lower() for x in x.split())) #lowercase
article= article.apply(lemmatize_text1)
article= article.apply(lemmatize_text2)
article= article.apply(lambda x: " ".join(x for x in x if x not in stop)) #remove stopwords

In [7]:
labelencoder = LabelEncoder()
ytrain_matrix=labelencoder.fit_transform(ytrain)
np.unique(ytrain_matrix)
kf = KFold(n_splits=5)

# Bag of words

In [8]:
cv = CountVectorizer(max_features = 10000) #the number of words to be included in the sparse matrix can be mordified 
xtrain_matrix = cv.fit_transform(article)

## Random Forest

In [9]:
classifier = RandomForestClassifier(n_estimators=150)

In [10]:
for train_index, test_index in kf.split(xtrain_matrix,ytrain_matrix):
    classifier.fit(xtrain_matrix[train_index],ytrain_matrix[train_index])
    ypred=classifier.predict(xtrain_matrix[test_index])
    ytestt=ytrain_matrix[test_index]
    print(clrp(ytestt,ypred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      4995
           1       0.96      0.98      0.97      8947
           2       0.97      0.91      0.94      2358
           3       0.93      0.93      0.93      6059

   micro avg       0.94      0.94      0.94     22359
   macro avg       0.94      0.93      0.94     22359
weighted avg       0.94      0.94      0.94     22359

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      5029
           1       0.96      0.98      0.97      8878
           2       0.96      0.91      0.93      2419
           3       0.93      0.93      0.93      6033

   micro avg       0.94      0.94      0.94     22359
   macro avg       0.94      0.93      0.94     22359
weighted avg       0.94      0.94      0.94     22359

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      4893
           1       0.

## SVM

In [11]:
classifier2 = LinearSVC(tol=1e-5) 

In [12]:
for train_index, test_index in kf.split(xtrain_matrix,ytrain_matrix):
    classifier2.fit(xtrain_matrix[train_index],ytrain_matrix[train_index])
    ypred=classifier2.predict(xtrain_matrix[test_index])
    ytestt=ytrain_matrix[test_index]
    print(clrp(ytestt,ypred))



              precision    recall  f1-score   support

           0       0.91      0.91      0.91      4995
           1       0.98      0.99      0.98      8947
           2       0.96      0.96      0.96      2358
           3       0.94      0.94      0.94      6059

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.95      0.95      0.95     22359
weighted avg       0.95      0.95      0.95     22359

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      5029
           1       0.98      0.98      0.98      8878
           2       0.96      0.95      0.96      2419
           3       0.93      0.93      0.93      6033

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.95      0.95      0.95     22359
weighted avg       0.95      0.95      0.95     22359

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      4893
           1       0.

# SVD

In [13]:
svd = TruncatedSVD(n_components = 3000,random_state=42) 

In [14]:
x_train = svd.fit_transform(xtrain_matrix)

In [15]:
x_train.shape

(111795, 1000)

## Random Forest

In [16]:
for train_index, test_index in kf.split(xtrain,ytrain_matrix):
    classifier.fit(x_train[train_index],ytrain_matrix[train_index])
    ypred=classifier.predict(x_train[test_index])
    ytestt=ytrain_matrix[test_index]
    print(clrp(ytestt,ypred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90      4995
           1       0.94      0.98      0.96      8947
           2       0.97      0.84      0.90      2358
           3       0.92      0.92      0.92      6059

   micro avg       0.93      0.93      0.93     22359
   macro avg       0.93      0.91      0.92     22359
weighted avg       0.93      0.93      0.93     22359

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5029
           1       0.93      0.98      0.96      8878
           2       0.97      0.82      0.89      2419
           3       0.92      0.93      0.92      6033

   micro avg       0.93      0.93      0.93     22359
   macro avg       0.93      0.90      0.92     22359
weighted avg       0.93      0.93      0.93     22359

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      4893
           1       0.

## SVM

In [17]:
for train_index, test_index in kf.split(xtrain,ytrain_matrix):
    classifier2.fit(x_train[train_index],ytrain_matrix[train_index])
    ypred=classifier2.predict(x_train[test_index])
    ytestt=ytrain_matrix[test_index]
    print(clrp(ytestt,ypred))



              precision    recall  f1-score   support

           0       0.91      0.91      0.91      4995
           1       0.98      0.98      0.98      8947
           2       0.96      0.94      0.95      2358
           3       0.93      0.94      0.93      6059

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.95      0.94      0.94     22359
weighted avg       0.95      0.95      0.95     22359





              precision    recall  f1-score   support

           0       0.91      0.91      0.91      5029
           1       0.97      0.98      0.98      8878
           2       0.97      0.93      0.95      2419
           3       0.93      0.93      0.93      6033

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.95      0.94      0.94     22359
weighted avg       0.95      0.95      0.95     22359





              precision    recall  f1-score   support

           0       0.92      0.89      0.91      4893
           1       0.98      0.98      0.98      8972
           2       0.95      0.95      0.95      2442
           3       0.92      0.94      0.93      6052

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.94      0.94      0.94     22359
weighted avg       0.95      0.95      0.95     22359





              precision    recall  f1-score   support

           0       0.90      0.92      0.91      4889
           1       0.98      0.98      0.98      9120
           2       0.96      0.94      0.95      2351
           3       0.94      0.93      0.93      5999

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.95      0.94      0.94     22359
weighted avg       0.95      0.95      0.95     22359





              precision    recall  f1-score   support

           0       0.91      0.91      0.91      5028
           1       0.98      0.98      0.98      8917
           2       0.96      0.94      0.95      2450
           3       0.93      0.93      0.93      5964

   micro avg       0.95      0.95      0.95     22359
   macro avg       0.95      0.94      0.94     22359
weighted avg       0.95      0.95      0.95     22359

