# Load Data

## Import Libraries
Import the required libraries.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df_train = pd.read_csv("data/train_set.csv")
df_test = pd.read_csv("data/test_set.csv")

### Inspect DataFrames

In [3]:
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
df_train.shape

(33000, 2)

In [5]:
df_test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [6]:
df_test.shape

(5682, 2)

### Data Types

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   5682 non-null   int64 
 1   text    5682 non-null   object
dtypes: int64(1), object(1)
memory usage: 88.9+ KB


### Missing Data

In [9]:
# Identify missing data function.
def total_missing(df, column_name):
    miss_entr = df[column_name].isnull().sum()
    return miss_entr

In [10]:
total_missing(df_train, df_train.columns)

lang_id    0
text       0
dtype: int64

In [11]:
total_missing(df_test, df_test.columns)

index    0
text     0
dtype: int64

### Unique Data

In [12]:
df_train.nunique()

lang_id       11
text       29948
dtype: int64

In [13]:
df_train["lang_id"].value_counts()

tsn    3000
tso    3000
nbl    3000
nso    3000
ssw    3000
zul    3000
eng    3000
sot    3000
ven    3000
xho    3000
afr    3000
Name: lang_id, dtype: int64

### All Lower Case

In [14]:
df_train["text"] = df_train["text"].str.lower()

In [15]:
def to_lower(df):
    df["text"] = df["text"].str.lower()
    return df

In [16]:
df_train = to_lower(df_train)
df_test = to_lower(df_test)

## Feature Engineering

### CountVectorizer

In [17]:
def feat_CountVec(df):
    vect = CountVectorizer()
    vect.fit(df["text"])
    vector_cv = vect.transform(df["text"])
    return vector_cv

In [18]:
def feat_CountVec(train, test):
    vect = CountVectorizer()
    vect.fit(train)
    train_cv = vect.transform(train)
    test_cv = vect.transform(test)
    return train_cv, test_cv

In [19]:
train, test = feat_CountVec(df_train["text"], df_test["text"])

### TfidfVectorizer

In [20]:
def feat_TfidfVec(df):
    vect = TfidfVectorizer()
    vect.fit(df["text"])
    vector_tv = vect.transform(df["text"])
    return vector_tv

In [21]:
# def feat_TfidfVec(train, test):
#     vect = TfidfVectorizer()
#     vect.fit(train)
#     train_cv = vect.transform(train)
#     test_cv = vect.transform(test)
#     return train_cv, test_cv

In [22]:
# train, test = feat_TfidfVec(df_train["text"], df_test["text"])

NameError: name 'TfidfVectorizer' is not defined

### Train Test Split

In [23]:
def train_split(df, X_feat):
    X = X_feat
    y = df["lang_id"]
    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return (X_train, y_train), (X_test, y_test)

In [24]:
(X_train, y_train), (X_test, y_test) = train_split(df_train, train)

# Modelling


## BernoulliNB

In [25]:
def accuracy(confusion_matrix):
   diagonal_sum = confusion_matrix.trace()
   sum_of_all_elements = confusion_matrix.sum()
   return diagonal_sum / sum_of_all_elements

In [26]:
# X = df_train["text"]
# y = df_train["lang_id"]

In [27]:
model_class = BernoulliNB()

In [28]:
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
         }

In [29]:
model_rfc = GridSearchCV(model_class, params, n_jobs = -1, cv = 10)
# model_rfc = MultinomialNB()

In [31]:
# le = LabelEncoder()
# df_train["lang_id"] = le.fit_transform(df_train["lang_id"])

In [32]:
train, test = feat_CountVec(df_train["text"], df_test["text"])

In [33]:
(X_train, y_train), (X_test, y_test) = train_split(df_train, train)

In [34]:
model_rfc.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=BernoulliNB(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]})

In [35]:
pred_lr = model_rfc.predict(X_test)

In [36]:
print('Classification Report')
print(classification_report(y_test, pred_lr))

Classification Report


NameError: name 'classification_report' is not defined

In [37]:
cm = confusion_matrix(pred_lr, y_test)
# Printing the accuracy
print("Accuracy of MLPClassifier : ", accuracy(cm))
# 0.9992424242424243

NameError: name 'confusion_matrix' is not defined

In [38]:
df_submit = subm_df(model_rfc, test, df_test[["index"]])

NameError: name 'subm_df' is not defined

In [171]:
write_submission(df_submit)

## MultinomialNB

In [404]:
model_class = MultinomialNB()

In [405]:
params = {
    
         }

In [406]:
model_rfc = GridSearchCV(model_class, params, n_jobs = -1, cv = 20)

In [407]:
train, test = feat_CountVec(df_train["text"], df_test["text"])

In [408]:
(X_train, y_train), (X_test, y_test) = tt_split(df_train, train)

In [409]:
model_gbc.fit(X_train, y_train)

BernoulliNB()

In [291]:
pred_lr = model_gbc.predict(X_test)

In [292]:
print('Classification Report')
print(classification_report(y_test, pred_lr))

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       583
           1       1.00      1.00      1.00       615
           2       0.99      1.00      1.00       583
           3       1.00      1.00      1.00       625
           4       1.00      1.00      1.00       618
           5       1.00      1.00      1.00       584
           6       1.00      1.00      1.00       598
           7       1.00      1.00      1.00       561
           8       1.00      1.00      1.00       634
           9       1.00      1.00      1.00       609
          10       1.00      0.99      1.00       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [293]:
cm = confusion_matrix(pred_lr, y_test)
# Printing the accuracy
print("Accuracy of MLPClassifier : ", accuracy(cm))
# 0.9989393939393939

Accuracy of MLPClassifier :  0.9989393939393939


In [294]:
df_submit = subm_df(model_gbc, test, df_test[["index"]])

In [183]:
write_submission(df_submit)