In [86]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import stopwordsiso
import unidecode
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
cv = CountVectorizer(ngram_range=(1,1))

In [87]:
df = pd.read_csv('12languages15million.csv') 

In [91]:
def preProcessing(data: pd.DataFrame):
    
    '''Returns X and y'''
    
    X = data["Text"]
    y = data["Language"]
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    with open('label_encoder_eng_ind', 'wb') as fil:
        pickle.dump(le, fil)
    
    data_list = []
    
    for text in X:
        text = re.sub(r'[!@#$(),n"%^*.?:;~`0-9]', ' ', text)
        text = re.sub(r'[[]]', ' ', text)
        text = text.lower()
        data_list.append(text)
    
    return X, y

In [92]:
x, y = preProcessing(df)

In [98]:
print('--- Preprocessing is done! ---')

--- Preprocessing is done! ---


In [99]:
len(y_test)

150

In [93]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.00066, stratify=y)

In [94]:
def partialFit_predict(X, y, n_batches: int):
    
    print('--- Partial fit has begun! ---')
    '''Dividing the dataset into chunks before fitting'''
    
    def batches(l, n):
        for i in range(0, len(l), n):
            yield l[i:i+n]
    
    global cv
    i = 0
    for batch in batches(range(len(X)), n_batches):
        model = MultinomialNB()
        i = i + 1
        print(i)
        
        try:
            x = cv.fit_transform(X[batch[0]:batch[-1]+1]).toarray()
            print(X.shape)
            model.partial_fit(x, y[batch[0]:batch[-1]+1], classes=np.unique(y))
        except:
            x = cv.transform(X[batch[0]:batch[-1]+1]).toarray()
            model.partial_fit(x, y[batch[0]:batch[-1]+1], classes=np.unique(y))

    print('--- Training is done! ---\n\n')
    global y_test, x_test
    x_test = cv.transform(x_test).toarray()
    
    with open('count_vectorizer_eng_ind', 'wb') as f:
        pickle.dump(cv, f)
    
    y_pred = model.predict(x_test)
    
    with open('partial_fit_model_eng_ind', 'wb') as files:
        pickle.dump(model, files)
    
    ac = accuracy_score(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f'''Accuracy score: {ac}\n\n''', cr)
    print(sns.heatmap(cm, annot=True))

In [95]:
partialFit_predict(x_train, y_train, 10000)

Accuracy score: 0.8333333333333334

               precision    recall  f1-score   support

           0       0.72      0.93      0.81        56
           1       1.00      0.70      0.82        47
           2       0.89      0.85      0.87        47

    accuracy                           0.83       150
   macro avg       0.87      0.83      0.84       150
weighted avg       0.86      0.83      0.83       150

[0 2 0 0 1 2 0 0 0 0 1 1 2 0 1 2 2 2 0 1 0 1 0 0 1 0 0 0 1 1 2 0 0 0 1 0 1
 1 0 2 2 2 0 0 2 2 0 2 0 0 2 0 0 2 2 2 2 0 2 0 0 1 1 2 0 1 0 0 0 2 0 0 0 0
 1 1 2 2 2 0 2 0 0 1 1 2 2 1 0 1 2 0 0 1 0 0 0 2 1 0 0 2 2 1 1 0 2 1 0 2 0
 0 2 0 0 1 0 0 2 1 2 2 0 0 0 0 0 2 1 2 1 0 0 2 0 2 1 0 1 0 2 0 0 2 0 0 1 2
 2 0]
