In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the labelled training data and the unlabelled test data.
train_df, test_df = (pd.read_csv(csv_path)
                     for csv_path in ('train_set.csv', 'test_set.csv'))
# reset_index returns a new frame, so this line only displays the test data;
# test_df itself is left unchanged.
test_df.reset_index(drop=True)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


In [3]:
# Preview the first rows of the training data (columns: lang_id, text).
train_df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
# Preview the first rows of the test data (columns: index, text; no label column).
test_df.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


Check for any null values 

In [5]:
# Count missing values per column in the training data.
train_df.isnull().sum()

lang_id    0
text       0
dtype: int64

In [6]:
# Count missing values per column in the test data.
test_df.isnull().sum()

index    0
text     0
dtype: int64

Take a look at all the languages present in the train dataset

In [7]:
# Count how many training sentences carry each language label.
train_df["lang_id"].value_counts()

xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

The train dataset contains 11 languages with 3000 sentences from each language, so the classes are perfectly balanced. Together with the absence of missing values, this means the dataset is ready to be used to train a machine learning model.

In [8]:
# Features are the raw sentences; targets are the language codes.
X, y = (np.array(train_df[column]) for column in ("text", "lang_id"))

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Multinomial Naive Bayes 

In [9]:
# Word counts -> TF-IDF weighting -> multinomial Naive Bayes classifier.
mnb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# Fit on the training split, then report per-language metrics on the
# held-out split.
y_pred = mnb.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       984
         eng       1.00      1.00      1.00       991
         nbl       0.99      1.00      1.00       953
         nso       1.00      1.00      1.00      1026
         sot       1.00      1.00      1.00      1022
         ssw       1.00      1.00      1.00       998
         tsn       1.00      1.00      1.00       984
         tso       1.00      1.00      1.00       952
         ven       1.00      1.00      1.00      1034
         xho       1.00      1.00      1.00      1007
         zul       1.00      0.99      0.99       939

    accuracy                           1.00     10890
   macro avg       1.00      1.00      1.00     10890
weighted avg       1.00      1.00      1.00     10890



## Support Vector Machines (SVM)

In [10]:
# Word counts -> TF-IDF weighting -> linear classifier trained by SGD with
# hinge loss and an L2 penalty.
# NOTE(review): max_iter=5 allows very few passes over the data; confirm the
# optimizer converges (scikit-learn may emit a ConvergenceWarning).
svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(alpha=1e-3, loss='hinge', max_iter=5,
                              penalty='l2', random_state=42)),
])
# Fit on the training split, then report per-language metrics on the
# held-out split.
y_pred = svm.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       984
         eng       0.99      1.00      1.00       991
         nbl       0.99      0.97      0.98       953
         nso       1.00      0.99      0.99      1026
         sot       1.00      1.00      1.00      1022
         ssw       1.00      0.98      0.99       998
         tsn       0.99      1.00      0.99       984
         tso       0.99      1.00      0.99       952
         ven       1.00      1.00      1.00      1034
         xho       0.96      0.99      0.98      1007
         zul       0.98      0.95      0.96       939

    accuracy                           0.99     10890
   macro avg       0.99      0.99      0.99     10890
weighted avg       0.99      0.99      0.99     10890



## Model Tuning

In [12]:
# Search grid for the Naive Bayes pipeline; keys follow the
# "<step name>__<parameter>" convention used to address pipeline steps.
parameters_mnb = dict(
    vect__ngram_range=[(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),         # with / without IDF re-weighting
    clf__alpha=(1e-2, 1e-3),              # smoothing strength
)

# Cross-validated grid search over the full labelled set, using all cores.
mnb_gs = GridSearchCV(mnb, parameters_mnb, n_jobs=-1).fit(X, y)

In [13]:
# Search grid for the SGD/hinge pipeline; keys follow the
# "<step name>__<parameter>" convention used to address pipeline steps.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF re-weighting
    'clf-svm__alpha': (1e-2, 1e-3),          # regularisation strength
}

# Cross-validated grid search over the full labelled set, using all cores.
# The fitted search is bound back to `svm_gs` (the original assigned it to a
# stray `gsvm_gs`), matching how `mnb_gs` is handled.
svm_gs = GridSearchCV(svm, parameters_svm, n_jobs=-1)
svm_gs = svm_gs.fit(X, y)

# A bare expression is only displayed when it is the last line of a cell, so
# the score is printed explicitly; the best parameters stay as the cell output.
print(f"Best mean cross-validated score: {svm_gs.best_score_:.4f}")
svm_gs.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

## Submission

In [14]:
# Predict a language for every test sentence with the tuned Naive Bayes model.
# The predictions get their own name so they do not overwrite `y_test`, the
# held-out labels produced by train_test_split; clobbering it would break any
# re-run of the evaluation cells.
test_pred = mnb_gs.predict(test_df['text'])

# Use the ids shipped with the test file rather than rebuilding them from the
# row position, so the submission stays correct even if the file's ids are not
# exactly 1..n in row order.
output = pd.DataFrame({'index': test_df['index'],
                       'lang_id': test_pred})
output.to_csv('Lista_Abutto_Submission.csv', index=False)
output

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
