In [391]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# Character-trigram TF-IDF vectorizer; partialFit_predict uses this
# module-level instance and pickles it after training.
tv = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))

In [None]:
# Load the training data; the 'Text' and 'Language' columns are read later on.
train_csv = 'train_data.csv'
df = pd.read_csv(train_csv)

In [393]:
# df.drop_duplicates(inplace=True)

In [394]:
# Shuffle the rows. The fixed seed keeps the row order (and therefore the
# content of each training chunk) identical across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [395]:
# Give the frame a fresh 0..n-1 index. Rebinding instead of inplace=True keeps
# the cell side-effect free on any other reference to the previous frame.
df = df.reset_index(drop=True)

In [236]:
def preProcessing(data: pd.DataFrame):
    '''Clean the text column and integer-encode the language labels.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a 'Text' column (raw sentences) and a 'Language' column
        (label of each sentence).

    Returns
    -------
    (pd.Series, array)
        The cleaned texts (lower-cased, punctuation and digits removed, only
        purely alphabetic tokens kept, single-space separated) with a fresh
        0..n-1 index, and the labels produced by LabelEncoder.fit_transform,
        in the same row order.

    Side effects
    ------------
    The fitted LabelEncoder is pickled to 'Label_Encoder_ENG' in the current
    working directory so the integer labels can be decoded later.
    '''

    le = LabelEncoder()
    y = le.fit_transform(data["Language"])

    with open('Label_Encoder_ENG', 'wb') as files:
        pickle.dump(le, files)

    # Every character that must become a space. The set is escaped as a whole
    # because a bare '-' between '}' and '<' forms an invalid (reversed) range
    # and a bare '[' / ']' does not behave as a literal inside a class.
    strip_chars = '!@#$()~={}-<>/&*_\'"%,^.?:;`+[]'
    noise = re.compile('[' + re.escape(strip_chars) + '0-9]')

    def clean(text):
        # str() keeps non-string cells from crashing the regex.
        # NOTE(review): a missing value therefore becomes the token 'nan'.
        lowered = noise.sub(' ', str(text)).lower()
        # split() collapses runs of whitespace and trims both ends; isalpha()
        # drops any token that still carries a non-letter character.
        return ' '.join(token for token in lowered.split() if token.isalpha())

    data_list = pd.Series([clean(text) for text in data['Text']])

    return data_list, y

In [237]:
# Cleaned texts and encoded labels for the whole (shuffled) frame.
x_train, y_train = preProcessing(data=df)

In [None]:
# Progress marker for the notebook run.
print('--- Preprocessing is done! ---')

In [None]:
# x_train is the Series of cleaned texts returned by preProcessing and has no
# 'Language' column, so the filter is built from df. The mask is applied
# positionally: x_train rows are in the same order as the df rows they came from.
english_mask = (df['Language'] == 'English').to_numpy()
print(x_train[english_mask])

In [38]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 10000, stratify=y)

In [40]:
def partialFit_predict(X, y, n_batches: int, laplace_smoothing_param: float):
    '''Train one MultinomialNB model chunk by chunk and pickle the artefacts.

    The module-level TF-IDF vectorizer ``tv`` is fitted once on all of ``X`` so
    that every chunk is mapped onto the same feature columns; a single model is
    then updated with ``partial_fit`` on each chunk in turn. Despite the name,
    nothing is predicted here: evaluation on held-out data has to be done
    separately with the pickled vectorizer and model.

    Parameters
    ----------
    X : sequence of str exposing ``.shape`` and supporting ``X[a:b]`` slicing
        Cleaned training texts.
    y : array-like supporting ``y[a:b]`` slicing
        Encoded labels, in the same order as ``X``.
    n_batches : int
        Number of samples per chunk (this is the chunk size, not the number of
        chunks).
    laplace_smoothing_param : float
        Passed to MultinomialNB as ``alpha``.

    Side effects
    ------------
    Fits the module-level ``tv`` and writes it to 'tfidf_vectorizer_ENG';
    writes the trained model to 'NB_Model_ENG'.

    Raises
    ------
    ValueError
        If ``n_batches`` is not positive, ``X`` is empty, or ``X`` and ``y``
        differ in length.
    '''

    if n_batches <= 0:
        raise ValueError('n_batches must be a positive chunk size')

    n_samples = int(X.shape[0])
    if n_samples == 0:
        raise ValueError('X is empty; there is nothing to train on')
    if len(y) != n_samples:
        raise ValueError('X and y must have the same number of samples')

    print('--- Partial training has begun! ---\n')
    print(f'''Train Data shape: {n_samples}''')

    # Learn the vocabulary and IDF weights once. Re-fitting per chunk would give
    # each chunk a different set of feature columns, which partial_fit cannot
    # combine.
    tv.fit(X)

    # One model for the whole run, so every chunk adds to the same counts.
    model = MultinomialNB(alpha=laplace_smoothing_param)
    classes = np.unique(y)

    for chunk_no, start in enumerate(range(0, n_samples, n_batches), start=1):
        stop = min(start + n_batches, n_samples)
        print(f'''Dataset chunk number: {chunk_no}\n''')
        # Rows still waiting to be processed after this chunk.
        print(f'''Train Data shape: {n_samples - stop}\n''')

        # The sparse TF-IDF matrix is passed as is; MultinomialNB accepts
        # sparse input, so no dense copy is needed.
        features = tv.transform(X[start:stop])
        model.partial_fit(features, y[start:stop], classes=classes)

    print('--- Training is done! ---\n\n')

    with open('tfidf_vectorizer_ENG', 'wb') as f:
        pickle.dump(tv, f)

    with open('NB_Model_ENG', 'wb') as files:
        pickle.dump(model, files)

In [41]:
# Train in chunks of 10000 rows with alpha=0.
# NOTE(review): how alpha=0 is handled depends on the scikit-learn version - confirm.
partialFit_predict(
    X=x_train,
    y=y_train,
    n_batches=10000,
    laplace_smoothing_param=0,
)

--- Partial training has begun! ---

Train Data shape: 80000
Dataset chunk number: 1

Train Data shape: 70000

Dataset chunk number: 2

Train Data shape: 60000

Dataset chunk number: 3

Train Data shape: 50000

Dataset chunk number: 4

Train Data shape: 40000

Dataset chunk number: 5

Train Data shape: 30000

Dataset chunk number: 6

Train Data shape: 20000

Dataset chunk number: 7

Train Data shape: 10000

Dataset chunk number: 8

Train Data shape: 0

--- Training is done! ---


Accuracy score: 0.8548

               precision    recall  f1-score   support

           0       0.82      0.91      0.86       905
           1       0.82      0.76      0.79       896
           2       0.74      0.79      0.76       910
           3       0.82      0.85      0.83       917
           4       0.87      0.86      0.86       917
           5       0.78      0.75      0.76       908
           6       0.84      0.86      0.85       917
           7       0.92      0.90      0.91       919
   