## Hackathon challenge

We start by importing the libraries required for text processing and for loading the datasets.

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from string import punctuation
import nltk
#nltk.download(['stopwords','punkt']) will pass stopwords to the tfidvectorizer
#from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


KeyboardInterrupt



##### 

# EDA (EXPLORATORY DATA ANALYSIS)

The training and testing data are loaded for use

In [None]:
# Read the training and test CSV files (paths are relative to the working directory)
train, test = (pd.read_csv(csv_path) for csv_path in ('./train_set.csv', './test_set.csv'))

We check the contents of both the training and testing data to get an overview of what we will be dealing with.

In [None]:
# Preview the first five rows of the training data (head() defaults to 5 rows)
train.head()

In [None]:
# Preview the first five rows of the testing data (head() defaults to 5 rows)
test.head()

We then check the shape of our datasets. This helps in choosing the right model, as some models might not do well on small datasets while others might do well on large datasets.

In [None]:
# A notebook cell only displays its last expression, so a bare `train.shape`
# followed by `test.shape` would hide the training shape. Return both as one
# tuple: ((train_rows, train_cols), (test_rows, test_cols)).
train.shape, test.shape

##### 

# Feature engineering

We start by defining a function to clean our text.

In [5]:
def text_preprocessing(text):
    '''
    Clean a single text sample.

    Steps, in order:
      1. lower-case the text
      2. remove @mentions
      3. remove https://t.co/... short links
      4. remove every token that contains a digit
      5. drop '#' symbols (the hashtag word itself is kept)
      6. collapse all whitespace (spaces, tabs, line breaks) to single spaces
         and strip leading/trailing whitespace

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()

    # Mentions must be removed with re.sub: str.replace treats its argument as
    # a literal string, so a regex pattern passed to it never matches.
    text = re.sub(r'@\w*', '', text)

    # Remove t.co short links (the dot is escaped so it only matches a real '.').
    text = re.sub(r'\bhttps://t\.co/\w+', '', text)

    # Remove any "word" containing at least one digit. One pass is enough:
    # nothing after this step can introduce new digits.
    text = re.sub(r'\w*\d\w*', '', text)

    # Drop only the '#' symbol. To remove the full hashtag use r'#\w*' instead.
    text = text.replace('#', '')

    # split() with no arguments splits on any whitespace run (including '\n'),
    # so this both removes line breaks and collapses repeated spaces.
    return ' '.join(text.split())

time: 0 ns (started: 2022-06-24 20:05:17 +01:00)


We apply the above function to both the training and testing datasets.

In [10]:
# Clean the 'text' column of both datasets with the same preprocessing function
for frame in (train, test):
    frame['text'] = frame['text'].apply(text_preprocessing)

time: 9.92 s (started: 2022-06-24 20:05:18 +01:00)


In [11]:
# Replace the literal substring '.txt' with ' text file'.
# regex=False forces a literal match: when the pattern is treated as a regular
# expression, '.' matches ANY character (so e.g. 'atxt' would also be rewritten),
# and omitting the argument triggers a pandas FutureWarning about the changing default.
train["text"] = train["text"].str.replace(".txt", " text file", regex=False)
test["text"] = test["text"].str.replace(".txt", " text file", regex=False)

  train["text"] = train["text"].str.replace(".txt", " text file")


time: 406 ms (started: 2022-06-24 20:05:28 +01:00)


  test["text"] = test["text"].str.replace(".txt", " text file")


##### 

# MODEL BUILDING

We separate our training dataset into X and Y awaiting model building

In [12]:
# Features are the cleaned text; the target is the language label
X, y = train['text'], train['lang_id']

time: 16 ms (started: 2022-06-24 20:05:28 +01:00)


We then split the data into training and validation sets.

In [18]:
# Hold out a small validation set.
# test_size=0.01 was used only for the purpose of the hackathon (to keep almost
# all labelled data for training); use a larger share for a trustworthy estimate.
# random_state is fixed so the split, and therefore the reported metrics, are
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01, random_state=42)

time: 15 ms (started: 2022-06-24 23:26:47 +01:00)


We then fit our model and check the performance metrics

In [27]:
# Tune the smoothing parameter (alpha) of a TF-IDF + Multinomial Naive Bayes model.
#
# The grid search wraps the WHOLE pipeline, so the TF-IDF vocabulary and IDF
# weights are re-fitted on the training part of every CV fold. Nesting
# GridSearchCV as the last pipeline step instead would fit TF-IDF on all of
# X_train first and leak validation-fold statistics into the CV scores.
# Pipeline step parameters are addressed as '<step name>__<parameter>'.
param_grid = {'mnb__alpha': [0.1, 0.01, 0.001]}  # candidate smoothing values

mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                   max_df=0.9,
                                                   ngram_range=(1, 1))),
                         ('mnb', MultinomialNB())
                         ])

# refit=True (the default) retrains the best pipeline on all of X_train, so
# tuned_mnb can be used directly for prediction afterwards.
tuned_mnb = GridSearchCV(mnb_pipeline,
                         param_grid=param_grid,
                         cv=6,
                         scoring='f1_weighted')

tuned_mnb.fit(X_train, y_train)  # Fitting the model

y_pred_mnb = tuned_mnb.predict(X_val)  # predicting on the validation set

print(classification_report(y_val, y_pred_mnb))


              precision    recall  f1-score   support

         afr       1.00      1.00      1.00        26
         eng       1.00      1.00      1.00        23
         nbl       1.00      1.00      1.00        23
         nso       1.00      1.00      1.00        37
         sot       1.00      1.00      1.00        33
         ssw       1.00      1.00      1.00        31
         tsn       1.00      1.00      1.00        35
         tso       1.00      1.00      1.00        23
         ven       1.00      1.00      1.00        29
         xho       1.00      1.00      1.00        32
         zul       1.00      1.00      1.00        38

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       330

time: 7.14 s (started: 2022-06-24 23:38:09 +01:00)


###### 

# MAKING SUBMISSION TO KAGGLE

In [28]:
# Build the submission frame: the test set's 'index' column plus the predicted
# language label for each row, then write it to CSV without the DataFrame index.
submission_trial = test[['index']].assign(lang_id=tuned_mnb.predict(test['text']))
submission_trial.to_csv('hackathon.csv', index=False)

time: 407 ms (started: 2022-06-24 23:38:27 +01:00)
