In [35]:
# Pandas and numpy for basic data manipulation
import pandas as pd
import numpy as np

# String and re (regular expression) for string/regex manipulations
import string
import re

# Counter for EDA (emojis)
from collections import Counter


# SciKit-Learn (sklearn) classes for model building, text vectorisation, and metrics for performance analysis
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold,RepeatedStratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score, log_loss, make_scorer,confusion_matrix
from sklearn.pipeline import Pipeline,FeatureUnion

#NLTK (Natural Language Tool Kit) and Spacy for NLP (Natural Language Processing)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import nltk
import spacy
#from spacy_langdetect import LanguageDetector #detects which language is used
import en_core_web_sm #Spacy English language dictionary

# Fetch the NLTK resources one after another, in a fixed order
# (tokeniser models, WordNet, stop-word lists, POS tagger).
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Plotting/visualisation packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud

[nltk_data] Downloading package punkt to C:\Users\venetia
[nltk_data]     mokgawa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\venetia
[nltk_data]     mokgawa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\venetia
[nltk_data]     mokgawa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\venetia mokgawa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [36]:
# Read the training file first, then the test file, both from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv'))

In [37]:
# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [38]:
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [39]:
# Report the row count, column count and first two column names of each dataset,
# one line per dataset.
print(
    f'The train dataset has {train.shape[0]} rows/entries and {train.shape[1]} columns ({train.columns[0]}, {train.columns[1]})',
    f'The test dataset has {test.shape[0]} rows/entries and {test.shape[1]} columns ({test.columns[0]} and {test.columns[1]})',
    sep='\n',
)

The train dataset has 33000 rows/entries and 2 columns (lang_id, text)
The test dataset has 5682 rows/entries and 2 columns (index and text)


In [40]:
# X holds the feature (the 'text' column) and y holds the label (the 'lang_id' column).
X, y = train['text'], train['lang_id']

In [41]:
# Hold out 10% of the labelled data as a validation set.
# random_state pins the shuffle so that re-running the notebook reproduces the same split
# (and therefore the same reported metrics); stratify=y keeps each language's share
# the same in the training and validation partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

In [None]:
# Multinomial Naive Bayes on character 6-gram TF-IDF features. The final pipeline
# step is itself a 5-fold grid search over the smoothing parameter alpha.
param_grid = {'alpha': [0.01, 0.1, 1]}
nb6_vectoriser = TfidfVectorizer(analyzer='char', ngram_range=(6, 6), min_df=1, max_df=0.5)
nb6_alpha_search = GridSearchCV(MultinomialNB(), param_grid=param_grid, cv=5, n_jobs=-1)
NB = Pipeline([('NB_tfidf', nb6_vectoriser), ('NB_clf', nb6_alpha_search)])

# Fit on the training split, then predict the held-out split.
y_pred_NB = NB.fit(X_train, y_train).predict(X_test)

# Store the report as a dict and print the human-readable version.
NB_metrics = classification_report(y_test, y_pred_NB, output_dict=True)
print(classification_report(y_test, y_pred_NB))

In [31]:
# Same Naive Bayes set-up as a character 4-gram variant: TF-IDF features feeding a
# 5-fold grid search over alpha.
param_grid = {'alpha': [0.01, 0.1, 1]}
nb4_vectoriser = TfidfVectorizer(analyzer='char', ngram_range=(4, 4), min_df=1, max_df=0.5)
nb4_alpha_search = GridSearchCV(MultinomialNB(), param_grid=param_grid, cv=5, n_jobs=-1)
NB = Pipeline([('tfidf', nb4_vectoriser), ('nb', nb4_alpha_search)])

# Fit on the training split, then predict the held-out split.
y_pred_NB = NB.fit(X_train, y_train).predict(X_test)

# Store the report as a dict and print the human-readable version.
NB_metrics = classification_report(y_test, y_pred_NB, output_dict=True)
print(classification_report(y_test, y_pred_NB))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       304
         eng       1.00      1.00      1.00       307
         nbl       1.00      1.00      1.00       274
         nso       1.00      1.00      1.00       318
         sot       1.00      1.00      1.00       264
         ssw       1.00      1.00      1.00       292
         tsn       1.00      1.00      1.00       315
         tso       1.00      1.00      1.00       296
         ven       1.00      1.00      1.00       334
         xho       1.00      1.00      1.00       292
         zul       1.00      1.00      1.00       304

    accuracy                           1.00      3300
   macro avg       1.00      1.00      1.00      3300
weighted avg       1.00      1.00      1.00      3300



In [None]:
# Tune a character-trigram TF-IDF + Multinomial Naive Bayes pipeline.
#
# The grid search wraps the WHOLE pipeline so that vectoriser options can be tuned
# together with the classifier. Pipeline parameters are addressed as
# '<step name>__<parameter>' (double underscore), and the estimator handed to
# GridSearchCV must be an instance, not a class.
NB_pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=1, max_df=0.5, ngram_range=(3, 3), analyzer='char')),
                        ('nb', MultinomialNB())])

param_grid = {'tfidf__use_idf': (True, False),
              'tfidf__norm': ('l1', 'l2'),
              'nb__alpha': [0.01, 0.1, 1]}

NB_search = GridSearchCV(NB_pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
NB_search.fit(X_train, y_train)

# With refit=True (the default) best_estimator_ is the best pipeline refit on all of
# X_train, so NB remains a fitted Pipeline that can be used for prediction directly.
NB = NB_search.best_estimator_
y_pred_NB = NB.predict(X_test)


print(classification_report(y_test, y_pred_NB))
NB_metrics = classification_report(y_test, y_pred_NB, output_dict=True)

In [None]:
# Linear SVC on TF-IDF features built from 1- to 3-grams (default analyzer),
# keeping only terms that occur in at least two documents.
lsvc_vectoriser = TfidfVectorizer(ngram_range=(1, 3), min_df=2, smooth_idf=True)
LSVC = Pipeline([('LSVC_tfidf', lsvc_vectoriser), ('LSVC_clf', LinearSVC())])

# Fit on the training split, then predict the held-out split.
y_pred_LSVC = LSVC.fit(X_train, y_train).predict(X_test)

# The text report is built once, stored, and printed.
LSVC_metrics = classification_report(y_test, y_pred_LSVC)
print(LSVC_metrics)

In [None]:
# Ridge classifier (alpha=0.2) on TF-IDF features built from 1- to 4-grams
# (default analyzer), keeping only terms that occur in at least two documents.
ridge_vectoriser = TfidfVectorizer(ngram_range=(1, 4), min_df=2, smooth_idf=True)
Ridge = Pipeline([('Ridge_tfidf', ridge_vectoriser), ('Ridge_clf', RidgeClassifier(alpha=0.2))])

# Fit on the training split, then predict the held-out split.
y_pred_Ridge = Ridge.fit(X_train, y_train).predict(X_test)

# The text report is built once, stored, and printed.
Ridge_metrics = classification_report(y_test, y_pred_Ridge)
print(Ridge_metrics)

In [None]:
# Linear SVC on raw token counts (CountVectorizer with its default settings).
# Note that this rebinds the name LSVC.
lsvc_steps = [('LSVC_vect', CountVectorizer()), ('LSVC_clf', LinearSVC())]
LSVC = Pipeline(lsvc_steps)

# Fit on the training split, then predict the held-out split.
y_pred_LSVC = LSVC.fit(X_train, y_train).predict(X_test)

# The text report is built once, stored, and printed.
LSVC_metrics = classification_report(y_test, y_pred_LSVC)
print(LSVC_metrics)

In [None]:
# Combine raw token counts with sublinear, L2-normalised TF-IDF weights over
# 1- to 4-grams, and feed the concatenated features to a Linear SVC.
count_features = CountVectorizer()
tfidf_features = TfidfVectorizer(ngram_range=(1, 4), min_df=2, norm='l2',
                                 smooth_idf=True, sublinear_tf=True)

# A named pipeline step: (step name, transformer).
feature_union = ('feature_union', FeatureUnion([('count_vect', count_features),
                                                ('tfidf', tfidf_features)]))
feature = Pipeline([feature_union, ('LSVC_clf', LinearSVC())])

# Fit on the training split, then predict the held-out split.
y_pred_feature = feature.fit(X_train, y_train).predict(X_test)

# The text report is built once, stored, and printed.
feature_metrics = classification_report(y_test, y_pred_feature)
print(feature_metrics)

In [None]:
# Grid-search a TF-IDF + Linear SVC pipeline.
#
# The pipeline is built here instead of reusing the global `LSVC`: that name is also
# bound to a CountVectorizer pipeline whose first step is called 'LSVC_vect', and with
# that pipeline every 'LSVC_tfidf__*' key below would be an invalid parameter.
LSVC_tfidf_pipeline = Pipeline([('LSVC_tfidf', TfidfVectorizer(min_df=2, smooth_idf=True, ngram_range=(1, 3))),
                                ('LSVC_clf', LinearSVC())])

LSVC_param_grid = {'LSVC_clf__C': [1, 1.01, 1.02, 1.03],
                   'LSVC_tfidf__max_df': (0.9, 0.999),
                   # An integer min_df is an absolute document count and must be >= 1;
                   # 1 keeps every term, which is what the former value 0 was meant to do.
                   'LSVC_tfidf__min_df': (1, 0.00001, 0.001),
                   'LSVC_tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)]}

# Shuffled, seeded 5-fold stratified CV so the search is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored with micro-averaged F1; refit=True retrains the best
# configuration on all of X, y once the search has finished.
LSVC_searchCV = GridSearchCV(LSVC_tfidf_pipeline, cv=skf, param_grid=LSVC_param_grid, verbose=3,
                             scoring='f1_micro', n_jobs=-1, refit=True)
LSVC_searchCV.fit(X, y)

In [None]:
# Multinomial Naive Bayes on raw character 1- to 3-gram counts.
nb_char_counts = CountVectorizer(analyzer='char', ngram_range=(1, 3), min_df=1)
NB = Pipeline([('Count_vec', nb_char_counts), ('NB_clf', MultinomialNB())])

# A second, word-level TF-IDF + Naive Bayes pipeline; it is defined but not fitted in this cell.
# NOTE(review): its step names ('CountVec', 'svc') do not describe the estimators they hold.
naive_tfidf = Pipeline([('CountVec', TfidfVectorizer()), ('svc', MultinomialNB())])

# Fit on the training split, then predict the held-out split.
y_pred_NB = NB.fit(X_train, y_train).predict(X_test)

# Store the report as a dict and print the human-readable version.
NB_metrics = classification_report(y_test, y_pred_NB, output_dict=True)
print(classification_report(y_test, y_pred_NB))

In [None]:
# Grid-search the Ridge pipeline (steps 'Ridge_tfidf' and 'Ridge_clf').
Ridge_param_grid = {'Ridge_clf__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                    'Ridge_tfidf__max_df': (0.9, 0.999),
                    # An integer min_df is an absolute document count and must be >= 1.
                    # With error_score=0 below, an invalid value would not raise: those
                    # candidates would just be scored 0. 1 keeps every term, as 0 was meant to.
                    'Ridge_tfidf__min_df': (1, 0.00001, 0.001),
                    'Ridge_tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)]}

# 5-fold stratified CV repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# error_score=0: a candidate whose fit fails is given a score of 0 instead of aborting the search.
Ridge_searchCV = GridSearchCV(Ridge, cv=cv, param_grid=Ridge_param_grid, n_jobs=-1, error_score=0)
Ridge_searchCV.fit(X, y)

# Summarise the winning configuration; per-candidate means and standard deviations
# are available in Ridge_searchCV.cv_results_.
print("Best: %f using %s" % (Ridge_searchCV.best_score_, Ridge_searchCV.best_params_))

In [None]:
# Show the parameter combination that scored best in the Linear SVC grid search.
LSVC_searchCV.best_params_

In [32]:
# Refit the current NB model on the full labelled data (training + validation splits).
# NOTE(review): NB is assigned in several cells, so which model is refit here depends on
# the cell that was executed last — confirm it is the intended one before submitting.
NB.fit(X, y)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(analyzer='char', max_df=0.5,
                                 ngram_range=(6, 6))),
                ('nb',
                 GridSearchCV(cv=5, estimator=MultinomialNB(), n_jobs=-1,
                              param_grid={'alpha': [0.01, 0.1, 1]}))])

In [33]:
# Build the submission table: the test set's 'index' column plus the language
# predicted by NB for each test text, then write it out without the row index.
# NOTE(review): the output file name has no '.csv' extension — confirm this is intended.
Sub_df = pd.DataFrame(test['index']).assign(lang_id=NB.predict(test['text']))
Sub_df.to_csv('naive8', index=False)