In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
train = pd.read_csv('Data\Train.csv')
train.head()

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER


In [3]:
train.Label.value_counts()

POLITICS                279
SOCIAL                  152
RELIGION                147
LAW/ORDER               136
SOCIAL ISSUES           134
HEALTH                  127
ECONOMY                  86
FARMING                  78
SPORTS                   49
EDUCATION                43
RELATIONSHIPS            39
WILDLIFE/ENVIRONMENT     36
OPINION/ESSAY            26
LOCALCHIEFS              25
CULTURE                  23
WITCHCRAFT               16
MUSIC                    15
TRANSPORT                11
FLOODING                  7
ARTS AND CRAFTS           7
Name: Label, dtype: int64

## 1. Feature Engineering

In [4]:
# average word length
def avg_word_len(text):
    words = text.split()
    return np.mean([len(word) for word in words])

train['avg_word_len'] = train.Text.apply(avg_word_len)

In [5]:
train.head()

Unnamed: 0,ID,Text,Label,avg_word_len
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS,6.903323
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS,6.3
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH,6.248062
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS,6.660907
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER,7.244444


In [6]:
# number of words
def num_words(text):
    return len(text.split())

train['num_words'] = train.Text.apply(num_words)
train.head()

Unnamed: 0,ID,Text,Label,avg_word_len,num_words
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS,6.903323,331
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS,6.3,370
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH,6.248062,129
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS,6.660907,463
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER,7.244444,45


In [16]:
# proper nouns
def prop_nouns(text):
    proper_nouns= r'(([A-Z]([a-z]+|\.+))+(\s[A-Z][a-z]+)+)|([A-Z]{2,})|([a-z][A-Z])[a-z]*[A-Z][a-z]*'
    list_of_pns = re.findall(proper_nouns, text)
    return (([x[0] for x in list_of_pns if x]))


train['prop_nouns'] = train.Text.apply(lambda x: len([pn for pn in prop_nouns(x) if pn]))
train.head()   


Unnamed: 0,ID,Text,Label,avg_word_len,num_words,prop_nouns
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS,6.903323,331,10
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS,6.3,370,10
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH,6.248062,129,5
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS,6.660907,463,13
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER,7.244444,45,0


## 2. Data Processing

In [23]:
X_train = train[['Text','avg_word_len','num_words','prop_nouns']]
y_train = train.Label

In [24]:
test = pd.read_csv('Data\Test.csv')
X_test = test.Text

## 3. Logistic Regression

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [40]:
col_transformer = ColumnTransformer(transformers=[
    ('vect', CountVectorizer(ngram_range=(1,4)),'Text'),
    ('passthrough', 'passthrough', ['avg_word_len', 'num_words','prop_nouns']),
    ('drop', 'drop',['Text'])
])

In [41]:
pipe = Pipeline(steps=[
    ('col_transformer', col_transformer),
    ('logit', LogisticRegression(max_iter=2000, n_jobs=-1))
])

# 

In [56]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('col_transformer',
                 ColumnTransformer(transformers=[('vect',
                                                  CountVectorizer(ngram_range=(1,
                                                                               4)),
                                                  'Text'),
                                                 ('passthrough', 'passthrough',
                                                  ['avg_word_len', 'num_words',
                                                   'prop_nouns']),
                                                 ('drop', 'drop', ['Text'])])),
                ('logit', LogisticRegression(max_iter=2000, n_jobs=-1))])

In [57]:
test['prop_nouns'] = test.Text.apply(lambda x: len([pn for pn in prop_nouns(x) if pn]))
test['num_words'] = test.Text.apply(num_words)
test['avg_word_len'] = test.Text.apply(avg_word_len)
test.head()   

Unnamed: 0,ID,Text,prop_nouns,num_words,avg_word_len
0,ID_ADHEtjTi,Abambo odzikhweza akuchuluka Kafukufuku wa ap...,9,346,6.182081
1,ID_AHfJktdQ,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...,8,170,6.305882
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...,4,435,6.170115
3,ID_AUKYBbIM,Ulova wafika posauzana Adatenga digiri ya uph...,19,727,6.441541
4,ID_AZnsVPEi,"Dzombe kukoma, koma Kuyambira makedzana, pant...",11,613,6.323002


In [60]:
X_test = test[['Text','avg_word_len','num_words','prop_nouns']]
X_test.head()

Unnamed: 0,Text,avg_word_len,num_words,prop_nouns
0,Abambo odzikhweza akuchuluka Kafukufuku wa ap...,6.182081,346,9
1,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...,6.305882,170,8
2,Anatcheleza: Akundiopseza a gogo wanga Akundi...,6.170115,435,4
3,Ulova wafika posauzana Adatenga digiri ya uph...,6.441541,727,19
4,"Dzombe kukoma, koma Kuyambira makedzana, pant...",6.323002,613,11


In [61]:
y_test_pred = pipe.predict(X_test)

In [62]:
submission = pd.DataFrame({'ID' : list(test.ID),
                           'label' : list(y_test_pred)})

submission.to_csv('Data\submission.csv')

0.57 Accuracy that time.