In [None]:
!pip install scikit-learn



In [24]:
import numpy as np
import pandas as pd
import csv

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer

import re
from string import punctuation

import warnings
warnings.filterwarnings("ignore")
np.random.seed(123)


In [25]:
train = pd.read_csv("train.csv",engine = "python", error_bad_lines=False)
test = pd.read_csv('test.csv',engine="python")

In [26]:
train.head()

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",uchumi
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",kitaifa
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,uchumi
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",michezo
4,SW12560,Na AGATHA CHARLES â€“Â DAR ES SALAAM ALIYEKUW...,kitaifa


In [27]:
test.head()

Unnamed: 0,id,content
0,SW4255,WAZIRI MKUU Kassim Majaliwa amep okea leseni ...
1,SW15677,RAIS John Magufuli amewataka viongozi wa Halm...
2,SW15925,"NEW YORK, MAREKANI MKALI wa hip hop nchini Mar..."
3,SW7615,"WAZIRI wa Kilimo, Dk Charles Tizeba amelitaka..."
4,SW28011,"Mwandishi wetu, Tanga WAFANYABIASHARA wa Mkoa ..."


In [28]:
#check mssing values in train and test
print( train.isnull().sum())
print(test.isnull().sum())

id          0
content     0
category    0
dtype: int64
id         0
content    0
dtype: int64


In [29]:
#check news category distribution
train.category.value_counts()

kitaifa      10242
michezo       6004
burudani      2229
uchumi        2028
kimataifa     1906
afya           859
Name: category, dtype: int64

Data prep

In [30]:
# mapping categorical values
categorical_mapping = {"kitaifa":0,"michezo":1, "burudani":2 , "kimataifa":3, "uchumi":4, "afya":5}

In [31]:
train['category'] = train.category.map(categorical_mapping)
train.head()

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",4
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",0
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,4
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",1
4,SW12560,Na AGATHA CHARLES â€“Â DAR ES SALAAM ALIYEKUW...,0


In [32]:
#function to clean data
def text_cleaning(text):
  text = re.sub(r"[^A-Za-z0-9]"," ", text)
  text = re.sub(r'\b\d+(?:\.\d+)\s+','',text)#remove numbers
  text = text.lower()
  text = ''.join([c for c in text if c not in punctuation ])
  return (text)

In [33]:
# clean train and test data
train["content"] = train["content"].apply(text_cleaning)
test["content"] = test["content"].apply(text_cleaning)

train.head()

In [34]:
train.head(20)

Unnamed: 0,id,content,category
0,SW4670,bodi ya utalii tanzania ttb imesema itafan...,4
1,SW30826,pendo fundisha mbeya rais dk john magufuri ...,0
2,SW29725,mwandishi wetu singida benki ya nmb imetoa ms...,4
3,SW20901,timu ya taifa ya tanzania serengeti boys jan...,1
4,SW12560,na agatha charles dar es salaam aliyekuw...,0
5,SW30734,majadiliano kati ya aliyekuwa mkurugenzi wa z...,0
6,SW28096,mwandishi wetu butiama mkuu wa majeshi mst...,0
7,SW1223,huenda manchester united ikasubiri ukaguzi w...,1
8,SW20534,shirika la bima la taifa nic linakuja na bi...,4
9,SW9193,rais wa zanzibar dk ali mohamed shein amewat...,0


In [35]:
X = train["content"]
y = train.category.values

In [36]:
#transform text data
vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(X)

X_transformed = vectorizer.transform(X)

test_transformed = vectorizer.transform(test["content"])

In [37]:
#split data into train and validate
X_train, X_valid,y_train, y_valid = train_test_split(X_transformed,y,test_size = 0.20, random_state=42,shuffle=True,stratify=y)

Create classifier

In [38]:
news_classifier = MultinomialNB()
news_classifier.fit(X_train,y_train)

MultinomialNB()

In [39]:
y_probas = news_classifier.predict_proba(X_valid)

In [40]:
log_loss(y_valid, y_probas)

4.24181756366122

In [41]:
#create preds from test data
test_probas = news_classifier.predict_proba(test_transformed)

Create Submission file