In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [None]:
# Dataset location: defaults to the original local path, but can be redirected
# without editing the notebook by setting the NEWS_DATASET_CSV environment variable.
DATASET_CSV = os.environ.get(
    "NEWS_DATASET_CSV",
    "/home/hasan/Desktop/Code to keep on Github/Bengali news classification ML/dataset_final.csv",
)
dataset = pd.read_csv(DATASET_CSV)
dataset.head(3)

In [None]:
# (rows, columns) of the loaded dataset
dataset.shape

### Feature and Label data

In [None]:
# Collect the article text and the category name of every row as plain strings.
# Iterating over the column values (instead of label-indexing dataset[col][i]
# once per row) avoids a per-row index lookup and does not assume the frame
# has a default 0..n-1 index.
# NOTE: str() turns a missing value into the literal text 'nan'.
feature = [str(text) for text in dataset['content']]
label = [str(category) for category in dataset['category']]


In [None]:
# Sanity check: both lists should hold one entry per dataset row
length_report = "Length of the feature is {} and the length of the label is {}"
print(length_report.format(len(feature), len(label)))

### Sample Features and Labels

In [None]:
# Show the first five categories, each followed by its article text
for sample_idx in range(5):
    print(label[sample_idx], feature[sample_idx], sep="\n")

### Label Encoding

In [None]:
# Map every category name to an integer code. The fitted `encoder` is kept
# so codes can be translated back to category names later on.
encoder = LabelEncoder().fit(label)
encoded = encoder.transform(label)

### One Hot Encoding

In [None]:
# OneHotEncoder expects a 2-D column, so reshape the 1-D codes to (n_samples, 1)
class_labels = encoded.reshape(-1, 1)

# Produce the dense one-hot matrix with .toarray() instead of the `sparse=False`
# constructor flag: that flag was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so this form runs on both old and new releases.
ohe = OneHotEncoder()
y_ohe = ohe.fit_transform(class_labels).toarray()

In [None]:
# Inspect the one-hot encoded label matrix
y_ohe

### Tokenizing

In [None]:
# Fit the Keras tokenizer on the article texts, then convert every article
# into its sequence of tokens.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(feature)
X_token = tokenizer.texts_to_sequences(feature)


### Sample Features and Their Token Sequences

In [None]:
# Print the first three articles, each followed by its token sequence
for sample_idx in range(3):
    print(feature[sample_idx], X_token[sample_idx], sep="\n")
    

### Pad Sequence

In [None]:
# Bring every token sequence to a common length of `max_len`;
# shorter sequences are padded at the end ('post').
max_len = 300
X_pad = pad_sequences(X_token, maxlen=max_len, padding='post')

### Dividing Dataset

In [None]:
# The splitter produces five stratified 70/30 shuffles, but only the last one
# is retained as the train/test partition; the earlier four are discarded.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
all_splits = list(sss.split(X_pad, y_ohe))
train_index, test_index = all_splits[-1]

X_train, y_train = X_pad[train_index], y_ohe[train_index]
X_test, y_test = X_pad[test_index], y_ohe[test_index]
    

In [None]:
# Report the shapes of the four arrays produced by the split
shape_report = "Shape of X_train is {} Shape of y_train is {} Shape of X_test is {} Shape of y_test is {}"
print(shape_report.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

### RandomForest Algorithm

In [None]:
# Random forest with default settings; used as the base estimator of the hyperparameter search
rfc = RandomForestClassifier()

In [None]:
# Search space for the random-forest hyperparameter search.
# 'auto' is not offered for max_features: for classifiers it was only an alias
# of 'sqrt' and it has been removed from recent scikit-learn releases, where
# it makes the fit fail. 'log2' is searched as the second option instead.
hyperparameter_grid = {
    'n_estimators': list(range(200, 2001, 200)),   # 200, 400, ..., 2000
    'max_features': ['sqrt', 'log2'],
    'max_depth': list(range(10, 111, 10)),         # 10, 20, ..., 110
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}


In [None]:
# Randomised search over the random-forest grid: 50 sampled configurations,
# each evaluated with 5-fold cross-validation.
# Candidates are ranked by accuracy; a regression metric such as
# 'neg_mean_absolute_error' does not describe classification quality.
random_cv = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    verbose=5,
    return_train_score=True,
    random_state=42,   # fixed seed so the sampled configurations are repeatable
)


In [None]:
# Run the search on the training split (50 candidates x 5 folds, so this may take a long time)
random_cv.fit(X_train, y_train)


In [None]:
# Best configuration found by the search
random_cv.best_estimator_

In [None]:
# Random forest with explicitly written hyperparameters (presumably copied from
# the search result), so the model can be rebuilt without re-running the search.
# Only the non-default settings are passed. The full estimator repr also carried
# `min_impurity_split`, which was removed in scikit-learn 1.0 and makes the
# constructor raise a TypeError on current releases.
rfc = RandomForestClassifier(
    n_estimators=400,
    max_depth=90,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
)

In [None]:
# Train the random forest on the training split (targets are the one-hot rows)
rfc.fit(X_train, y_train)

In [None]:
# Score of the fitted random forest on the held-out test split
rfc.score(X_test, y_test)

In [None]:
# Predictions of the random forest for the test split
predicted = rfc.predict(X_test)


#### classification report

In [None]:
# Precision, recall and F1 summary of the test-split predictions
print(classification_report(y_test, predicted))


#### confusion matrix

In [None]:
# Recover the category names of the test targets:
# one-hot rows -> integer codes -> category strings.
# The one-hot encoder returns a column of shape (n, 1); flatten it because
# LabelEncoder works on 1-D arrays (a column triggers a conversion warning).
oh_encod_inverse = ohe.inverse_transform(y_test).ravel()
label_encode_inverse = encoder.inverse_transform(oh_encod_inverse)
label_encode_inverse

In [None]:
# Confusion matrix of true vs. predicted categories on the test split.
# confusion_matrix cannot compare string labels with a 2-D indicator matrix,
# so reduce each prediction row to one class code and map the codes back to
# category names first.
# NOTE(review): a row in which no class was predicted falls back to class 0 via argmax.
predicted_codes = predicted.argmax(axis=1) if predicted.ndim == 2 else predicted
predicted_labels = encoder.inverse_transform(predicted_codes.astype(int))

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    confusion_matrix(label_encode_inverse, predicted_labels, labels=encoder.classes_),
    square=True, annot=True, fmt='d', cbar=False,
    xticklabels=encoder.classes_,
    yticklabels=encoder.classes_,
    ax=ax,
)

# confusion_matrix puts the true classes on the rows (y axis) and the
# predicted classes on the columns (x axis).
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');

#### Using cross_val_score

In [None]:
# 5-fold cross-validation of the random forest on the full padded dataset (X_pad, y_ohe)
cvs = cross_val_score(rfc, X_pad, y_ohe, cv=5)
cvs

In [None]:
# Mean score across the five folds
cvs.mean()


### XGBoost Algorithm

In [None]:
# XGBoost classifier with default settings; used as the base estimator of the hyperparameter search
classifier=xgboost.XGBClassifier()

In [None]:
# Search space for the XGBoost hyperparameter search
hyperparameter_grid = dict(
    n_estimators=[100, 500, 900, 1100, 1500],
    max_depth=[2, 3, 5, 10, 15],
    learning_rate=[0.05, 0.1, 0.15, 0.20],
    min_child_weight=[1, 2, 3, 4],
    booster=['gbtree', 'gblinear'],
    base_score=[0.25, 0.5, 0.75, 1],
)


In [None]:
# Randomised search over the XGBoost grid: 50 sampled configurations, each
# evaluated with 5-fold cross-validation on 4 parallel workers.
# Candidates are ranked by accuracy; a regression metric such as
# 'neg_mean_absolute_error' does not describe classification quality.
random_cv = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=4,
    verbose=5,
    return_train_score=True,
    random_state=42,   # fixed seed so the sampled configurations are repeatable
)


In [None]:
# A multi-class XGBoost classifier expects one integer class code per sample,
# so collapse the one-hot training targets back to codes before searching.
random_cv.fit(X_train, np.argmax(y_train, axis=1))

In [None]:
# Best configuration found by the search
random_cv.best_estimator_

In [None]:
# XGBoost classifier with explicitly written hyperparameters (presumably copied
# from the search result), so the model can be rebuilt without re-running the search.
# Only the deliberate settings are kept. The full estimator repr also listed
# library defaults, None placeholders and `gpu_id`, which newer XGBoost
# releases deprecate.
xgboost_classifier = xgboost.XGBClassifier(
    objective='multi:softprob',
    booster='gbtree',
    tree_method='exact',
    n_estimators=900,
    max_depth=2,
    learning_rate=0.1,
    min_child_weight=1,
    base_score=0.25,
    random_state=0,
)


In [None]:
xgboost_classifier.fit(X_train, y_train)

In [None]:
#printing score of the model
xgboost_classifier.score(X_test, y_test)

In [None]:
predicted = xgboost_classifier.predict(X_test)

#### classification report

In [None]:
print(classification_report(y_test, predicted))

#### confusion matrix

In [None]:
oh_encod_inverse = ohe.inverse_transform(y_test)
label_encode_inverse = encoder.inverse_transform(oh_encod_inverse)
label_encode_inverse

In [None]:
#finally printing
plt.figure(figsize=(8,8))
sns.heatmap(confusion_matrix(label_encode_inverse, predicted), square=True, annot=True, fmt='d', cbar=False,
           #xticklabels=digits.target_names,
           #yticklabels=digits.target_names
           )

plt.xlabel('True label')
plt.ylabel('Predicted data')


#### Using Cross_val_score

In [None]:
# 5-fold cross-validation of the XGBoost model on the full padded dataset.
# Uses the integer class codes in `encoded` (the target format a multi-class
# XGBoost classifier expects) and the explicitly configured
# `xgboost_classifier` rather than the default-settings `classifier`.
cvs = cross_val_score(xgboost_classifier, X_pad, encoded, cv=5)
cvs

In [None]:
# Mean score across the five folds
cvs.mean()

### Model Saving

In [None]:
# Save the trained model to disk.
# `xgboost_classifier` is the estimator that is fitted directly in this notebook.
# `classifier` is only handed to the search and cross-validation helpers, which
# train copies of it, so pickling `classifier` would store an untrained model.
filename = 'News_Classification.sav'
with open(filename, 'wb') as model_file:   # context manager closes the file handle
    pickle.dump(xgboost_classifier, model_file)

### Model Loading

In [None]:
# Load the model back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:   # context manager closes the file handle
    loaded_model = pickle.load(model_file)