In [1]:
import pandas as pd
import numpy as np

## 1. Data Processing

In [2]:
train = pd.read_csv('Data\Train.csv')
train.head()

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER


In [3]:
train.Label.value_counts()

POLITICS                279
SOCIAL                  152
RELIGION                147
LAW/ORDER               136
SOCIAL ISSUES           134
HEALTH                  127
ECONOMY                  86
FARMING                  78
SPORTS                   49
EDUCATION                43
RELATIONSHIPS            39
WILDLIFE/ENVIRONMENT     36
OPINION/ESSAY            26
LOCALCHIEFS              25
CULTURE                  23
WITCHCRAFT               16
MUSIC                    15
TRANSPORT                11
FLOODING                  7
ARTS AND CRAFTS           7
Name: Label, dtype: int64

In [4]:
X_train = train.Text
y_train = train.Label

In [5]:
test = pd.read_csv('Data\Test.csv')
test.head()

Unnamed: 0,ID,Text
0,ID_ADHEtjTi,Abambo odzikhweza akuchuluka Kafukufuku wa ap...
1,ID_AHfJktdQ,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...
3,ID_AUKYBbIM,Ulova wafika posauzana Adatenga digiri ya uph...
4,ID_AZnsVPEi,"Dzombe kukoma, koma Kuyambira makedzana, pant..."


In [6]:
X_test = test.Text

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [12]:
vect.fit(X_train)

X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

# 2. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(max_iter=2000)
logit.fit(X_train_dtm, y_train)

LogisticRegression(max_iter=2000)

### Model Evaluation

In [16]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(logit, X_train_dtm, y_train, cv=6, scoring='accuracy')
scores

array([0.59166667, 0.59166667, 0.64016736, 0.65271967, 0.62343096,
       0.62761506])

In [52]:
y_test_pred = logit.predict(X_test_dtm)

In [53]:
submission_logit = pd.DataFrame({'ID' : list(test.ID),
                                 'label' : list(y_test_pred)})

submission_logit.head()

Unnamed: 0,ID,label
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,HEALTH
4,ID_AZnsVPEi,HEALTH


In [54]:
submission_logit.to_csv('Data\logistic_regression_submission.csv')

This had an accuracy score of .58 (139th as of 4/19/21). Not bad for not doing much feature engineering, let's try Naive Bayes.

## 3. Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [29]:
pipe = Pipeline(steps=[
                       ('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('naive_bayes', MultinomialNB())
])
pipe

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('naive_bayes', MultinomialNB())])

In [30]:
param_dict = {'vect__ngram_range': [(1, 1), (1, 2)],
              'vect__stop_words' : ['english',None], # Putting this in here in case there's some mixing of languages
              'tfidf__use_idf': (True, False),
              'naive_bayes__alpha' : [0.0001, 0.001, 0.01,0.1, 1]} 

In [31]:
grid = GridSearchCV(pipe, param_dict, cv=6, scoring='accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=6,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('naive_bayes', MultinomialNB())]),
             param_grid={'naive_bayes__alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                         'tfidf__use_idf': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2)],
                         'vect__stop_words': ['english', None]},
             scoring='accuracy')

In [32]:
grid.best_score_

0.6177039748953975

In [34]:
grid.best_params_

{'naive_bayes__alpha': 0.001,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': 'english'}

In [35]:
best_predictor = grid.estimator
best_predictor

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('naive_bayes', MultinomialNB())])

In [38]:
best_predictor.fit(X_train,y_train)
y_test_pred = best_predictor.predict(X_test)
y_test_pred[:5]

array(['POLITICS', 'RELIGION', 'POLITICS', 'POLITICS', 'POLITICS'],
      dtype='<U20')

In [44]:
list(y_test_pred)[:5]

['POLITICS', 'RELIGION', 'POLITICS', 'POLITICS', 'POLITICS']

In [45]:
list(test.ID)[:5]

['ID_ADHEtjTi', 'ID_AHfJktdQ', 'ID_AUJIHpZr', 'ID_AUKYBbIM', 'ID_AZnsVPEi']

In [49]:
submission_nb = pd.DataFrame({'ID' : list(test.ID),
              'label' : list(y_test_pred)})

In [50]:
submission_nb.to_csv('naive_bayes_submission.csv')

This only had an accuracy score of .25. Looking at the predictions, it appears this model overfit politics and religion.