In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import nltk
import seaborn as sns
from collections import Counter
from nltk.tokenize import  word_tokenize
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder

In [None]:
data = pd.read_csv('train.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
# Unique labels
unique = data['Label'].unique()
unique

In [None]:
len(unique)

### Clean Text Data

In [None]:
# Load the custom stopword list: each non-blank line of the file becomes one entry.
# Surrounding whitespace (including a '\r' left by Windows line endings) is stripped
# and blank lines are skipped, so the list never contains empty or padded entries
# that could not match a token.
with open('malawi_stopwords.txt') as f:
    my_words = [line.strip() for line in f if line.strip()]

In [None]:
def my_stopwords(text, stopwords=None):
    """Remove stopwords from a piece of text.

    The text is split on whitespace, every token that appears in the stopword
    collection is dropped, and the remaining tokens are joined back together
    with single spaces. Matching is exact and case-sensitive.

    Parameters
    ----------
    text : str
        The text to filter.
    stopwords : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``my_words`` list
        is used.

    Returns
    -------
    str
        ``text`` without the stopword tokens (an empty string if nothing is left).
    """
    # A set gives constant-time membership checks for every token.
    stop = set(my_words if stopwords is None else stopwords)
    return ' '.join(token for token in text.split() if token not in stop)


In [None]:
def clean_text(data):
    """Normalise a single text string before vectorisation.

    Steps, in order: remove every token that contains a digit, convert to
    lower case, replace each punctuation character with a space, and collapse
    all whitespace runs to single spaces (also trimming both ends).

    Parameters
    ----------
    data : str
        The raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string: '\w' / '\d' in a normal string literal are invalid escape sequences.
    hold = re.sub(r'\w*\d\w*', '', data)  # remove tokens containing digits
    hold = hold.lower()  # convert to lower case
    hold = re.sub('[%s]' % re.escape(string.punctuation), ' ', hold)  # punctuation -> space
    # Punctuation replacement leaves runs of spaces inside the text; split/join
    # collapses them and trims the ends in one step.
    hold = ' '.join(hold.split())

    return hold

In [None]:
# Clean every training text, then drop stopwords, and store the result in the frame.
# The intermediate result is deliberately not bound to the name `clean_text`:
# doing so would replace the function with a Series, so this cell could not be
# re-run and any later call to clean_text(...) would fail.
data['clean_text'] = data['Text'].apply(clean_text).apply(my_stopwords)

In [None]:
data.head()

In [None]:
print(f"This is the original text length: {len(data['Text'][0])}, This is the cleaned text length: {len(data['clean_text'][0])}")

### Visualize the target class

In [None]:
# Bar chart of the number of rows per Label value; the x tick labels are rotated 90 degrees.
g = sns.countplot(x="Label", data=data)
plt.xticks(rotation=90)

In [None]:
# Labels exactly as stored in the 'Label' column.
# NOTE: `y` is reassigned to the encoded labels by le.fit_transform a few cells below.
y =  data['Label']

### Handling Imbalanced classes

In [None]:
le = LabelEncoder()

In [None]:
# Fit the encoder on the Label column and rebind `y` to the encoded labels it returns.
y = le.fit_transform(data['Label'])

In [None]:
data['enc_labels'] = y

### Method 1 for handling imbalanced classes

In [None]:
# 'balanced' class weights for the encoded labels.
# The arguments are passed by keyword: current scikit-learn releases no longer
# accept them positionally.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(data['enc_labels']),
    y=y,
)
# np.unique returns the classes sorted, so position k in `class_weights` belongs to
# the k-th smallest encoded label; enumerate therefore yields a label -> weight mapping
# as long as the encoded labels are 0..n_classes-1 (which LabelEncoder produces).
d_class_weights = dict(enumerate(class_weights))

In [None]:
d_class_weights

### Method 2 for handling imbalanced classes

In [None]:
# Count how many rows fall into each class.
# When the cells are run top to bottom, `y` holds the encoded labels here (from
# le.fit_transform), so the Counter keys are encoded values, not label names.
counter = Counter(y)
print(counter)

In [None]:
# Class -> number of occurrences, ordered from the most to the least frequent class.
mapper = dict(counter.most_common())

In [None]:
mapper

In [None]:
# Map each original label name known to the encoder to its encoded value.
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

In [None]:
le_name_mapping

In [None]:
new_weights = {}

In [None]:
# Assign a hand-picked weight to every class according to how often it occurs:
#   more than 100 rows -> 1
#   36 to 100 rows     -> 50
#   35 rows or fewer   -> 100
# `mapper` was built from Counter(y) after `y` had been label-encoded, so it is keyed
# by the encoded label; it is therefore looked up with the encoded value, not with the
# label name. The branches are exhaustive, so counts of exactly 35 or 100 also
# receive a weight and every class ends up in `new_weights`.
for encoded_label in le_name_mapping.values():
    n_rows = mapper[encoded_label]
    if n_rows > 100:
        new_weights[encoded_label] = 1
    elif n_rows > 35:
        new_weights[encoded_label] = 50
    else:
        new_weights[encoded_label] = 100
        


In [None]:
assert len(new_weights) == len(le_name_mapping)

In [None]:
new_weights

  ### Using TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# norm must be 'l1', 'l2' or None; None disables normalisation (the previous
# `norm=False` is not a valid value and is rejected by recent scikit-learn releases).
# With binary=True, use_idf=False and norm=None each feature is simply a 0/1
# indicator of whether the term occurs in the document.
tf = TfidfVectorizer(binary=True, norm=None, use_idf=False, smooth_idf=False,
                     min_df=1, max_df=1.0, max_features=None, ngram_range=(1, 1))
x = tf.fit_transform(data['clean_text'])
y = data['enc_labels']

In [None]:
# Split the vectorised samples and their labels into a training part and a hold-out part.
# test_size = 0.03 puts 3% of the rows in the hold-out part; stratify=y keeps the
# class proportions the same in both parts.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.03, random_state = 3, stratify=y)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
# Using logistic regression as Base Model.
# class_weight=d_class_weights applies the per-class weights computed under
# "Method 1 for handling imbalanced classes".
from sklearn.linear_model import LogisticRegression
Logclassifier = LogisticRegression(class_weight=d_class_weights,random_state = 3)
Logclassifier.fit(x_train, y_train)

In [None]:
y_pred_log = Logclassifier.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score
score_1 = accuracy_score(y_test, y_pred_log)
score_1*100

### Applying k-fold cross validation

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# 5 stratified folds repeated 3 times -> 15 train/validation splits per evaluation.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=3)

In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = Logclassifier, X = x_train, y = y_train, cv = cv)

In [None]:
accuracies

In [None]:
accuracies.mean()

In [None]:
accuracies.std()

In [None]:
# (Leftover interactive docstring lookup removed; run `LogisticRegression?` by hand in IPython if needed.)

In [None]:
# Grid search for the best logistic-regression hyper-parameters.

from sklearn.model_selection import GridSearchCV

# Candidate values per hyper-parameter. Options currently left out of the search:
#   solver:      'newton-cg', 'lbfgs', 'sag'
#   multi_class: 'multinomial'
#   penalty:     'l2', 'elasticnet'
parameters = dict(
    C=[1, 10, 100],
    solver=['liblinear', 'saga'],
    multi_class=['ovr', 'auto'],
    penalty=['l1'],
)

# Every parameter combination is scored by accuracy on the repeated stratified folds in `cv`.
grid_search = GridSearchCV(Logclassifier, parameters, scoring='accuracy', cv=cv)
grid_search = grid_search.fit(x_train, y_train)

In [None]:
best_accuracy = grid_search.best_score_
best_accuracy * 100 # Multiplied by 100 to get in percentage 

In [None]:
best_parameters = grid_search.best_params_
best_parameters

In [None]:
modelL = grid_search.best_estimator_

In [None]:
# grid_search was created without a `refit` argument, so (with scikit-learn's default
# refit=True) best_estimator_ has already been fitted on x_train / y_train; this call repeats that fit.
modelL.fit(x_train, y_train)

In [None]:
modelL.score(x_test, y_test)

In [None]:
# Same object as `modelL` (both names refer to grid_search.best_estimator_), so
# fitting `logistic` also changes the model that `modelL` refers to.
logistic = grid_search.best_estimator_

In [None]:
logistic.fit(x,y)

### test data

In [None]:
test = pd.read_csv('test.csv')

In [None]:
test.shape

In [None]:
test.head()

In [None]:
def clean_text(data):
    """Clean every string in an iterable of texts.

    Unlike the function of the same name defined earlier in the notebook, this
    version takes a whole collection (e.g. a DataFrame column) rather than a
    single string. Each text has tokens containing a digit removed, is
    lower-cased, has punctuation replaced by spaces, and has all whitespace
    runs collapsed to single spaces (also trimming both ends).

    Parameters
    ----------
    data : iterable of str
        The raw texts.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input.
    """
    # Compile once, outside the loop. Raw string: '\w' / '\d' in a normal string
    # literal are invalid escape sequences.
    digit_token_re = re.compile(r'\w*\d\w*')
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    cleaned = []
    for hold in data:
        hold = digit_token_re.sub('', hold)  # remove tokens containing digits
        hold = hold.lower()  # convert to lower case
        hold = punctuation_re.sub(' ', hold)  # punctuation -> space
        # Collapse the space runs left behind by punctuation and trim the ends.
        cleaned.append(' '.join(hold.split()))
    return cleaned

In [None]:
# Do not use test['Text'].apply(clean_text) here: the clean_text defined just above
# expects an iterable of texts, not a single string, so it is called on the whole column.
# The result is written straight into the frame instead of being bound to the name
# `clean_text`, so the function is not overwritten and this cell can be re-run.
test['clean_text'] = [my_stopwords(text) for text in clean_text(test['Text'])]

In [None]:
test.head()

In [None]:
vec = tf.transform(test['clean_text'])

In [None]:
pred= logistic.predict(vec) 

In [None]:
test['Label'] = le.inverse_transform(pred)

In [None]:
test.head()

In [None]:
final =  test[['ID', 'Label']]

In [None]:
#final.to_csv('submission21.csv',index=False) # 1, 5, 6,8, 9, 10,11,15

#### Things to note: I tried various combinations of hyperparameters during the grid search, but in the end setting solver = 'saga' and multi_class = 'ovr', together with the first method of handling imbalanced classes, gave the best results.

In [None]:
# https://zindi.africa/competitions/ai4d-malawi-news-classification-challenge/data

In [None]:
# https://towardsdatascience.com/getting-started-with-text-vectorization-2f2efbec6685