In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Shared NLP helpers used by the lemmatize_text* functions and the cleaning cell.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# NOTE(review): 'english3' does not look like a standard NLTK stopwords fileid
# (the stock list is 'english') — presumably a custom list installed locally; confirm.
# Stored as a set so the per-token `not in stop` membership test used during
# stop-word removal is a constant-time lookup instead of a scan of the whole list.
stop = set(stopwords.words('english3'))

def lemmatize_text1(text):
    """Tokenize ``text`` on whitespace and lemmatize every token as a verb.

    Returns the lemmatized tokens as a list, in their original order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return lemmas
def lemmatize_text2(text):
    """Lemmatize each element of ``text`` as a noun.

    ``text`` is iterated item by item (no tokenization happens here), and the
    lemmatized items are returned as a list in the same order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos="n"))
    return lemmas

In [3]:
# Load the labelled training set and the test set to be predicted.
dataset1 = pd.read_csv('train.csv')
dataset2 = pd.read_csv('test.csv')

# Build one text per row from title + body.
# - The ' ' separator keeps the last title word from being glued to the first
#   content word (plain `Title + Content` produced a single fused token there).
# - fillna('') keeps a missing Title or Content from turning the whole text
#   into NaN, which would break the string operations applied afterwards.
xtrain1 = dataset1['Title'].fillna('') + ' ' + dataset1['Content'].fillna('')
xtrain2 = dataset2['Title'].fillna('') + ' ' + dataset2['Content'].fillna('')

# Training rows come first, test rows second; the two parts are separated
# again later by positional slicing at the number of training rows.
article = pd.concat([xtrain1, xtrain2])

In [4]:
# Text cleaning pipeline; each step rebinds `article` to the cleaned Series.

# regex=True is required: recent pandas versions treat the pattern as a literal
# string by default, in which case nothing would be removed. Raw strings avoid
# the invalid-escape warnings that '\w' / '\d' trigger in normal literals.
article = article.str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation
article = article.str.replace(r'\d+', '', regex=True)  # remove numbers
# Lowercase and collapse every whitespace run to a single space.
article = article.apply(lambda text: " ".join(word.lower() for word in text.split()))
# Lemmatize as verbs (this step also tokenizes into a list), then as nouns.
article = article.apply(lemmatize_text1)
article = article.apply(lemmatize_text2)
# Drop stop words and join the remaining tokens back into one string per row.
article = article.apply(lambda tokens: " ".join(tok for tok in tokens if tok not in stop))

In [5]:
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# `article` holds the training rows first and the test rows after them, so the
# training part is the first len(dataset1) rows. Deriving the count from the
# data replaces the hard-coded row number, which silently breaks if train.csv changes.
n_train = len(dataset1)

vectorizer = TfidfVectorizer()
# Fit the vocabulary / IDF weights on training text only (.iloc = positional slice).
vectors = vectorizer.fit_transform(article.iloc[:n_train])
# `vectors` already contains exactly the training rows; no second slice is needed.
xtrain = vectors
ytrain = dataset1['Label']

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer code mapping from the training labels, then
# encode those same labels into the integer array used for model fitting.
labelencoder = LabelEncoder()
labelencoder.fit(ytrain)
ytrain_matrix = labelencoder.transform(ytrain)

In [7]:
from sklearn.metrics import classification_report as clrp
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

# 5-fold splitter used to evaluate the classifier on held-out parts of the training set.
kf = KFold(n_splits=5)
# Linear SVM with a fixed seed and a tight stopping tolerance.
classifier2 = LinearSVC(random_state=0, tol=1e-5)

In [8]:
# Cross-validation: for every fold, refit on the in-fold rows and print a
# per-class report for the held-out rows.
for train_index, test_index in kf.split(xtrain, ytrain_matrix):
    classifier2.fit(xtrain[train_index], ytrain_matrix[train_index])
    ytestt = ytrain_matrix[test_index]
    ypred = classifier2.predict(xtrain[test_index])
    print(clrp(ytestt, ypred))

# Final model: refit on the full training set before predicting the test set.
classifier2.fit(xtrain, ytrain_matrix)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      4995
           1       0.99      0.99      0.99      8947
           2       0.98      0.98      0.98      2358
           3       0.96      0.97      0.97      6059

   micro avg       0.97      0.97      0.97     22359
   macro avg       0.97      0.97      0.97     22359
weighted avg       0.97      0.97      0.97     22359

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      5029
           1       0.99      0.99      0.99      8878
           2       0.98      0.97      0.98      2419
           3       0.96      0.96      0.96      6033

   micro avg       0.97      0.97      0.97     22359
   macro avg       0.97      0.97      0.97     22359
weighted avg       0.97      0.97      0.97     22359

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      4893
           1       0.

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0)

In [9]:
# Human-readable names for the integer class codes produced by the classifier.
label_names = {0: 'Business', 1: 'Entertainment', 2: 'Health', 3: 'Technology'}

# The test rows are everything after the first len(dataset1) rows of `article`
# (training rows were concatenated first). They are stored in `xtest` instead
# of overwriting `xtrain`, so re-running the cross-validation cell after this
# one still sees the training matrix.
xtest = vectorizer.transform(article.iloc[len(dataset1):])
dataset2['predicted'] = classifier2.predict(xtest)
dataset2['predicted'] = dataset2['predicted'].map(label_names)

In [10]:
# Preview the first two test rows together with their predicted category.
dataset2.head(n=2)

Unnamed: 0,Id,Title,Content,predicted
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,Entertainment
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,Business


In [11]:
# Remove the raw text columns so only the remaining columns are exported.
dataset2 = dataset2.drop(columns=['Title', 'Content'])

In [12]:
# Write the predictions next to the notebook instead of to a hard-coded,
# user-specific absolute path, so the cell runs on any machine.
# index=False is the documented way to omit the row index from the file.
# Note: to_csv returns None when given a path, so `export_csv` is None; the
# name is kept only so any existing reference to it keeps resolving.
export_csv = dataset2.to_csv('export_dataframe2.csv', index=False, header=True)