References

Ramen16 (Name not found). 2021. IMDBReview. https://github.com/Ramen16july/IMDBreview.

Susan Li. 2018. Develop a NLP Model in Python & Deploy It With Flask. https://towardsdatascience.com/develop-a-nlp-model-in-python-deploy-it-with-flask-step-by-step-744f3bdd7776.

Sai Durga Kamesh Kota. 2020. Deploying Flask application with ML Models on AWS EC2 Instance. https://medium.com/shapeai/deploying-flask-application-with-ml-models-on-aws-ec2-instance-3b9a1cec5e13.


# Imports & Installs

In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import OneHotEncoder

# train_test
from sklearn.model_selection import train_test_split

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk

nltk.download('omw-1.4')
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import scipy.sparse as sparse

# LSA (latent semantic analysis via truncated SVD)

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import hamming_loss
from sklearn import preprocessing
from sklearn.metrics import multilabel_confusion_matrix, classification_report



from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential

import unicodedata

# flask deployment

import pickle
from flask import Flask,render_template,request,send_file,send_from_directory,jsonify
import zipfile
from zipfile import ZipFile


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Pre-Processing unit

In [None]:
def strip_accents(text):
    """Return *text* reduced to plain ASCII.

    The text is decomposed (NFD) so that accented letters become a base
    letter followed by combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops the marks together with every other
    non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return str(ascii_bytes.decode("utf-8"))

def cleanPunc(sentence):
    """Strip or blank out punctuation in *sentence*.

    * ``? ! „ " # |`` are deleted outright.
    * ``. , ) ( \\ /`` are each replaced by a single space so the words
      on either side stay separated.

    Leading/trailing whitespace is then stripped and remaining newlines
    are turned into spaces.

    Inside a character class ``|`` is a literal pipe, not an alternation,
    so the classes below list their members directly. The backslash is
    written as ``\\\\`` in the pattern; the earlier form ``\\|`` was an
    escaped pipe and never matched a backslash.
    """
    # Pipe is kept in the "delete" class because the previous pattern
    # removed it as well (as a side effect of the `|` separators).
    cleaned = re.sub(r'[?!„"#|]', r'', sentence)
    cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n", " ")
    return cleaned

def stemming(sentence):
    """Stem every whitespace-separated word of *sentence*.

    Each word is passed through the English Snowball stemmer; stems of
    two characters or fewer are discarded. The surviving stems are
    returned joined by single spaces.
    """
    snowball = SnowballStemmer("english")
    stems = (snowball.stem(token) for token in sentence.split())
    # Keep only stems longer than two characters.
    return " ".join(stem for stem in stems if len(stem) > 2)


# Compiled stop-word pattern. Built lazily by removeStopWords() on its
# first call and reused afterwards.
re_stop_words = None


def removeStopWords(sentence):
    """Replace English stop words in *sentence* with a single space.

    The stop-word list is NLTK's English list extended with a few number
    words and connectives. Matching is case-insensitive. A stop word is
    matched only when it is followed by a non-word character, and that
    character is consumed by the replacement.

    NOTE(review): because a trailing non-word character is required, a
    stop word at the very end of the string is left in place.

    The compiled pattern is stored in the module-level ``re_stop_words``
    so the word list is loaded and the regex compiled only once instead
    of on every call.
    """
    global re_stop_words

    if re_stop_words is None:
        stop_words = set(stopwords.words('english'))
        stop_words.update(['zero', 'one', 'two',
                           'three', 'four', 'five',
                           'six', 'seven', 'eight',
                           'nine', 'ten', 'may',
                           'also', 'across', 'among',
                           'beside', 'however', 'yet',
                           'within', 'since'])

        # A set has no stable iteration order, and regex alternation takes
        # the first alternative that succeeds. Sorting longest-first makes
        # the pattern identical on every run and prefers the longest stop
        # word when one is a prefix of another.
        ordered = sorted(stop_words, key=lambda word: (-len(word), word))
        alternatives = "|".join(re.escape(word) for word in ordered)
        re_stop_words = re.compile(r"\b(" + alternatives + ")\\W", re.I)

    return re_stop_words.sub(" ", sentence)

def preprocessing(text):
  """Run the full cleaning pipeline on one document and return the result.

  The steps run in a fixed order: accent stripping, punctuation cleanup,
  stop-word removal, then stemming.
  """
  pipeline = (strip_accents, cleanPunc, removeStopWords, stemming)
  for step in pipeline:
    text = step(text)
  return text

# Read, process and one-hot encode

In [None]:
# Load the data set, drop incomplete rows and the index column that was
# written out by a previous to_csv call.
df = pd.read_csv('assm_4.csv')
df = df.dropna(axis=0).drop('Unnamed: 0', axis=1)

# One hot Encoding
# 'topics' holds the textual form of a Python list, e.g. "['A', 'B']".
# Strip the list syntax so only a comma-separated string remains.
# regex=False makes every pattern a literal: '[' and ']' are regex
# metacharacters, and the default of `regex` differs between pandas
# versions. The order matters: "' " must be removed before "'".
for literal in ('[', ']', "' ", "'"):
    df['topics'] = df['topics'].str.replace(literal, '', regex=False)

df_dummy = (
    df['topics']
    .str.replace(", ", ',', regex=False)   # drop the space after each comma
    .str.get_dummies(',')                  # one 0/1 column per topic
)

df = pd.concat([df, df_dummy], axis=1)

# The 'Archive' topic is not used as a label; drop it from both frames.
df = df.drop('Archive', axis=1)
df_dummy = df_dummy.drop('Archive', axis=1)

  df['topics'] = df['topics'].str.replace('[', '')
  df['topics'] = df['topics'].str.replace(']', '')


In [None]:
# so we don't run into any trouble
# Force both text columns to str so the string-based cleaning functions
# never receive a non-string value.
for text_column in ("headline", "body"):
    df[text_column] = df[text_column].astype(str)

In [None]:
# Clean every article body; the result is what the vectorizer is fitted on.
df["processed"] = df["body"].apply(preprocessing)

# TF-IDF + SVD

In [None]:
# Fit the TF-IDF vocabulary on the cleaned bodies and vectorise them.
tc = TfidfVectorizer(
    max_df=0.75,
    max_features=1500,  # found with experimentation
    smooth_idf=True,
    stop_words='english',
)
X = tc.fit_transform(df["processed"])

In [None]:
# Project the TF-IDF matrix onto 500 SVD components; the fixed
# random_state makes the randomized solver repeatable.
svd_model = TruncatedSVD(
    n_components=500,
    n_iter=100,
    algorithm='randomized',
    random_state=122,
)
X = svd_model.fit_transform(X)

# NN Model

In [None]:
# Fully connected classifier over the SVD features. The final layer has
# 9 independent sigmoid units, trained with binary cross-entropy
# (assumes the target matrix has 9 label columns — TODO confirm against
# df_dummy).
model = Sequential([
    layers.Dense(250, input_dim=X.shape[1], kernel_initializer='he_uniform', activation='relu'),
    layers.Dense(150, activation='relu'),
    layers.Dense(100, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(9, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 250)               125250    
                                                                 
 dense_1 (Dense)             (None, 150)               37650     
                                                                 
 dense_2 (Dense)             (None, 100)               15100     
                                                                 
 dense_3 (Dense)             (None, 50)                5050      
                                                                 
 dense_4 (Dense)             (None, 9)                 459       
                                                                 
Total params: 183,509
Trainable params: 183,509
Non-trainable params: 0
_________________________________________________________________


# Model Training

In [None]:
# Target matrix: the one-hot topic columns as a NumPy array, one row per
# article and one column per topic.
y = df_dummy.to_numpy()

In [None]:
  # Hold out 20% of the rows; the split is shuffled and unseeded, so it
  # differs from run to run.
  X_train, X_test, y_train, y_test = train_test_split(
      X,
      y,
      shuffle=True,
      test_size=0.2,
  )

In [None]:
# Train for 10 epochs; the held-out split is also used as validation data.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=16,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Round the sigmoid outputs to get a 0/1 decision for every label.
y_pred = (model.predict(X_test)).round()

# With multi-label targets accuracy_score is the exact-match ratio: a row
# only counts when every label is predicted correctly.
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))
# zero_division=0 scores labels with no predicted/true samples as 0
# (the same value as the default) without emitting a warning per metric.
print('Weighted Precision: {:.2f}'.format(
    precision_score(y_test, y_pred, average='weighted', zero_division=0)))
print('Weighted Recall: {:.2f}'.format(
    recall_score(y_test, y_pred, average='weighted', zero_division=0)))
print('Weighted F1-score: {:.2f}'.format(
    f1_score(y_test, y_pred, average='weighted', zero_division=0)))

# Pass the indicator matrices directly: this yields exactly one 2x2 matrix
# per label column, in column order, so each matrix lines up with its name
# in df_dummy.columns. Collapsing rows with argmax first would discard the
# multi-label structure and only return matrices for the class indices
# that happen to occur, which zip() would then pair with the wrong names.
confusion = multilabel_confusion_matrix(y_test, y_pred)
print('Confusion Matrixes: \n')

for label, matrix in zip(df_dummy.columns, confusion):
  print('\n' + label + ':')
  print(matrix)



Accuracy: 0.55

Weighted Precision: 0.80
Weighted Recall: 0.69
Weighted F1-score: 0.73
Confusion Matrixes: 


Business:
[[148  27]
 [  3  28]]

Culture:
[[161   4]
 [ 12  29]]

Nature:
[[159   5]
 [ 17  25]]

Podcast:
[[153   6]
 [ 15  32]]

Politics:
[[161  17]
 [ 13  15]]

Sci&Tech:
[[185   4]
 [  3  14]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


# Model Dump

In [None]:
  # Persist the fitted TF-IDF vectorizer and SVD model with pickle. The
  # `with` blocks close (and therefore flush) each file before anything
  # tries to read it back.
  with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tc, tfidf_file)
  with open('model_svd.pkl', 'wb') as svd_file:
    pickle.dump(svd_model, svd_file)

  # The Keras network is written with its own serialiser rather than
  # pickled.
  model.save('model.h5')

# Deployment

In [None]:
# html and style templates: unpack both archives into the working
# directory, closing each archive once it has been extracted.
for archive_name in ('templates.zip', 'static.zip'):
    with ZipFile(archive_name, 'r') as archive:
        archive.extractall()

# init flask
app = Flask(__name__)

# load models
# NOTE: pickle.load executes arbitrary code from the file; only load
# .pkl files that were produced by this notebook.
with open('model_svd.pkl', 'rb') as svd_file:
    svd_model = pickle.load(svd_file)
with open('tfidf.pkl', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)
# `load_model` is not imported on its own anywhere in this file, so reach
# it through the already imported `keras` (tensorflow.keras) module.
model = keras.models.load_model('model.h5')

# home page
@app.route('/')
def home():
    """Render the ``home.html`` template for the site root."""
    page = render_template('home.html')
    return page

# predict
@app.route('/predict',methods=['POST'])
def predict():
    """Classify the submitted text and render the predicted categories.

    Reads the ``message`` form field, runs it through ``preprocessing``,
    the loaded TF-IDF vectorizer and the loaded SVD model, and rounds the
    network's outputs to a per-category yes/no decision. The names of all
    selected categories (each preceded by a space) are passed to
    ``result.html`` as ``prediction``; if none is selected a fixed
    "no category" message is passed instead.
    """
    # Category names used to label the model's output vector.
    # NOTE(review): the order must match the label columns the model was
    # trained on — verify against the training target columns.
    columns = np.array(['Business', 'Culture', 'Nature', 'Podcast', 'Politics', 'Sci&Tech','Society', 'Sport', 'Travel'])

    # The route is registered for POST only, so no method check is needed;
    # this also guarantees `output` is always bound before rendering.
    message = request.form['message']

    # process and predict
    sample = [preprocessing(message)]
    tfidf_features = tfidf.transform(sample)
    reduced_features = svd_model.transform(tfidf_features)
    scores = model.predict(reduced_features)
    selected = scores.round().astype(bool)[0]

    # get output ready
    if selected.any():
        output = "".join(" " + label for label in columns[selected])
    else:
        output = "This has no category"

    return render_template('result.html', prediction = output)

# run it
if __name__ == '__main__':
    # Listen on every network interface, port 8080.
    app.run(
        host='0.0.0.0',
        port=8080,
    )

Keras model archive loading:
File Name                                             Modified             Size
variables.h5                                   2023-02-28 18:40:40      2235164
metadata.json                                  2023-02-28 18:40:40           64
config.json                                    2023-02-28 18:40:40         2674
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dense_4
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........19

 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
