In [None]:
!pip install scikit-learn



In [299]:
import numpy as np
import pandas as pd
import csv

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from xgboost.sklearn import XGBClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator

import re
from string import punctuation

import warnings
warnings.filterwarnings("ignore")
np.random.seed(123)


In [240]:
train = pd.read_csv("train.csv")
test = pd.read_csv('test.csv',engine="python")
submission = pd.read_csv('sample_submission.csv')

In [224]:
train.head()

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",uchumi
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",kitaifa
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,uchumi
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",michezo
4,SW12560,Na AGATHA CHARLES – DAR ES SALAAM ALIYEKUWA K...,kitaifa


In [225]:
test.head()

Unnamed: 0,id,content
0,SW4255,WAZIRI MKUU Kassim Majaliwa amep okea leseni ...
1,SW15677,RAIS John Magufuli amewataka viongozi wa Halm...
2,SW15925,"NEW YORK, MAREKANI MKALI wa hip hop nchini Mar..."
3,SW7615,"WAZIRI wa Kilimo, Dk Charles Tizeba amelitaka..."
4,SW28011,"Mwandishi wetu, Tanga WAFANYABIASHARA wa Mkoa ..."


In [226]:
#check missing values in train and test
print( train.isnull().sum())
print(test.isnull().sum())

id          0
content     0
category    0
dtype: int64
id         0
content    0
dtype: int64


In [227]:
#check news category distribution
train.category.value_counts()

kitaifa      10242
michezo       6004
burudani      2229
uchumi        2028
kimataifa     1906
afya           859
Name: category, dtype: int64

Data prep

In [241]:
# mapping categorical values
categorical_mapping = {"kitaifa":0,"michezo":1, "burudani":2 , "kimataifa":3, "uchumi":4, "afya":5}

In [242]:
train['category'] = train.category.map(categorical_mapping)
train.head()

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",4
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",0
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,4
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",1
4,SW12560,Na AGATHA CHARLES – DAR ES SALAAM ALIYEKUWA K...,0


In [231]:
#clean data using gensim

import gensim.parsing.preprocessing as gsp
from gensim import utils

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [273]:
#cleaning train and test data
train["content"] = train["content"].apply(clean_text)
test["content"] = test["content"].apply(clean_text)

In [274]:
train.content

0         bodi ya utalii tanzania ttb imesema itafanya ...
1         pendo fundisha mbeya rais dk john magufuri am...
2        mwandishi wetu singida benki ya nmb imetoa msa...
3         timu ya taifa ya tanzania serengeti boys jana...
4         na agatha charles – dar es salaam aliyekuwa k...
                               ...                        
23263     alitoa pongezi hizo alipozindua rasmi hatua y...
23264     na nora damian dar es salaam tekla si jina la...
23265     mkuu wa mkoa wa njombe dk rehema nchimbi waka...
23266     mabingwa wa ligi kuu soka tanzania bara simba...
23267     wiki iliyopita nilianza makala haya yanayolen...
Name: content, Length: 23268, dtype: object

In [252]:
#Class to transform text to vectors

class Text2TfIdfTransformer(BaseEstimator):

    def __init__(self):
        self._model = TfidfVectorizer()
        pass

    def fit(self, df_x, df_y=None):
        #df_x = df_x.apply(lambda x : clean_text(x))
        self._model.fit(df_x)
        return self

    def transform(self, df_x):
        return self._model.transform(df_x)

In [275]:
tfidf = Text2TfIdfTransformer()
X_tf = tfidf.fit(train['content']).transform(train['content'])
test_tf = tfidf.transform(test['content'])
y = train.category.values

In [276]:
#split data into train and validate
X_train, X_valid,y_train, y_valid = train_test_split(X_tf, y, test_size = 0.20, random_state=42, shuffle=True, stratify=y)

In [263]:
print(test_tfid.shape)
print(X_tf.shape)

(7756, 113870)
(23268, 204675)


In [282]:
print(test_tf)

  (0, 201684)	0.02815686690854421
  (0, 194371)	0.025862685385040617
  (0, 194363)	0.04075921405940102
  (0, 194141)	0.03916814437101507
  (0, 194049)	0.03648872556929645
  (0, 187810)	0.1502422014964039
  (0, 187098)	0.02972718534819228
  (0, 186854)	0.13404281978769284
  (0, 185255)	0.058522614768177265
  (0, 182315)	0.06389305662757232
  (0, 176603)	0.10717250677740722
  (0, 166846)	0.01767196279266738
  (0, 166058)	0.09433531564869199
  (0, 164810)	0.10738506047622554
  (0, 164443)	0.061680426186475144
  (0, 162042)	0.05763558045887328
  (0, 159233)	0.2378941276422691
  (0, 157935)	0.045449521150703664
  (0, 157576)	0.0844449335892005
  (0, 154001)	0.03913881319748272
  (0, 150549)	0.04460704993103333
  (0, 147899)	0.0599854064532545
  (0, 147766)	0.08151404853538947
  (0, 138014)	0.07035424306806047
  (0, 137952)	0.07991463732525882
  :	:
  (7755, 12550)	0.02253118819310226
  (7755, 11649)	0.021603190359391018
  (7755, 10736)	0.01794589827275535
  (7755, 10338)	0.02087219269955350

# Create classifier

# Xgboost with stratified kfold

In [313]:
kfold = 10
skf = StratifiedKFold(n_splits=kfold, random_state=42)
params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 700
    }
for i, (train_index, test_index) in enumerate(skf.split(X_tf, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X_tf[train_index], X_tf[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    model = xgb.XGBClassifier(objective='multi:softprob', random_state=1 , max_de) # for multi-classes
    model.fit(X_train, y_train)
    y_probas = model.predict_proba(X_valid)
    print(log_loss(y_valid, y_probas))


[Fold 1/10]
Parameters: { params } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




KeyboardInterrupt: 

In [302]:
test_probas = model.predict_proba(test_tf)
submission_cols = ['kitaifa','michezo','burudani','uchumi','kimataifa','afya']
submission_df = pd.DataFrame(test_probas,columns=submission_cols)
submission_df['test_id'] = submission['test_id']
#rearrange
submission_df = submission_df[['test_id','kitaifa','michezo','burudani','uchumi','kimataifa','afya']]
submission_df.to_csv("xgb.csv",index = False)

# Multinomial NB with stratified

In [304]:
model = MultinomialNB()
for i, (train_index, test_index) in enumerate(skf.split(X_tf, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X_tf[train_index], X_tf[test_index]
    y_train, y_valid = y[train_index], y[test_index]
     # for multi-classes
    model.fit(X_train, y_train)
    y_probas = model.predict_proba(X_valid)
    print(log_loss(y_valid, y_probas))
    
test_probas = model.predict_proba(test_tf)
submission_cols = ['kitaifa','michezo','burudani','uchumi','kimataifa','afya']
submission_df = pd.DataFrame(test_probas,columns=submission_cols)
submission_df['test_id'] = submission['test_id']
#rearrange
submission_df = submission_df[['test_id','kitaifa','michezo','burudani','uchumi','kimataifa','afya']]
submission_df.to_csv("nb.csv",index = False)

[Fold 1/5]
2.0166473999194228
[Fold 2/5]
1.9494018910307152
[Fold 3/5]
1.951628441208626
[Fold 4/5]
2.0284632784855186
[Fold 5/5]
1.9531699769873638


# Stacking

In [306]:
X_train, X_valid,y_train, y_valid = train_test_split(X_tf, y, test_size = 0.20, random_state=42, shuffle=True, stratify=y)

# Baseline

In [278]:
def nb_classifier(X_train, y_train, X_valid, y_valid, Test_tfid, name):
    news_classifier = MultinomialNB()
    news_classifier.fit(X_train, y_train)
    y_probas = news_classifier.predict_proba(X_valid)
    
    print(log_loss(y_valid, y_probas))
    
    #create preds from test data
    test_probas = news_classifier.predict_proba(Test_tfid)
    submission_cols = ['kitaifa','michezo','burudani','uchumi','kimataifa','afya']
    submission_df = pd.DataFrame(test_probas,columns=submission_cols)
    submission_df['test_id'] = submission['test_id']
    submission_df = submission_df[['test_id','kitaifa','michezo','burudani','uchumi','kimataifa','afya']]
    submission_df.to_csv(name,index = False)
    return(test_probas)
    

In [279]:
nb_classifier(X_train,y_train,X_valid, y_valid,test_tf,'sub2.csv')

1.993845557930163


array([[9.99201014e-01, 7.67397967e-04, 9.00407192e-07, 1.53701146e-06,
        2.90568789e-05, 9.34247606e-08],
       [9.99996095e-01, 3.67247735e-06, 6.90801989e-09, 3.33448624e-08,
        1.89362088e-07, 2.59094509e-09],
       [7.60591286e-01, 1.73993361e-01, 6.20299476e-02, 2.68954104e-03,
        5.03501512e-04, 1.92362957e-04],
       ...,
       [9.99931951e-01, 5.61590248e-05, 2.11980457e-07, 1.84053880e-07,
        1.13346890e-05, 1.59686407e-07],
       [9.98653135e-01, 1.33145705e-03, 4.50863511e-07, 8.60847173e-07,
        1.39116653e-05, 1.84722564e-07],
       [9.99999529e-01, 4.70244681e-07, 4.88134533e-11, 5.67862335e-10,
        2.58469442e-10, 2.22148084e-10]])

In [266]:
y_probas = news_classifier.predict_proba(X_valid)
log_loss(y_valid, y_probas)

2.020790397914459

In [268]:
test_probas = news_classifier.predict_proba(test_tf)


# Create Submission file

In [169]:
submission_cols = ['kitaifa','michezo','burudani','uchumi','kimataifa','afya']
submission_df = pd.DataFrame(test_probas,columns=submission_cols)
submission_df['test_id'] = submission['test_id']

In [170]:
#rearrange
submission_df = submission_df[['test_id','kitaifa','michezo','burudani','uchumi','kimataifa','afya']]
submission_df.to_csv("firsy.csv",index = False)

# Rough work

In [105]:
pl_xgb_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
                         ('xgboost', xgb.XGBClassifier(objective='multi:softmax',max_depth=4,num_class=4))])
scores = cross_val_score(pl_xgb_tf_idf, X, y, cv=5)
print('Accuracy for Tf-Idf & XGBoost Classifier : ', scores.mean())

Accuracy for Tf-Idf & XGBoost Classifier :  0.8698645240544682


In [106]:

pl_random_forest_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
                                   ('random_forest', RandomForestClassifier())])
scores = cross_val_score(pl_random_forest_tf_idf, X, y, cv=5,scoring='neg_log_loss')
print('Accuracy for Tf-Idf & RandomForest : ', scores.mean())

KeyboardInterrupt: 

In [99]:
y = train['category']
pl_log_reg_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
                             ('random_forest', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100))])
scores = cross_val_score(pl_log_reg_tf_idf, X, y, cv=5,scoring='neg_log_loss')
print('log loss for Tf-Idf & Logistic Regression: ', scores.mean())

log loss for Tf-Idf & Logistic Regression:  -0.4194694855026995


In [212]:
#transform usinf tfidf
X = train["content"]
Test_tfid1 = test["content"]
y = train.category.values

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X)

X_tf = tfidf_vect.transform(X)
Test_tfid = tfidf_vect.transform(Test_tfid1)