In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
!pip install sentence-transformers



In [4]:
import html
import re

from sklearn.model_selection import train_test_split

In [5]:
pip install --upgrade sentence-transformers tqdm


Note: you may need to restart the kernel to use updated packages.


In [6]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm')


In [7]:
# Progress-bar helper and the sentence-transformers CrossEncoder class.
# NOTE(review): CrossEncoder is not referenced by any other cell visible in this notebook.
from tqdm import tqdm
from sentence_transformers import CrossEncoder


# Alias the plain console tqdm under the name `tqdm_notebook`.
# NOTE(review): nothing else in the visible cells uses this alias — confirm it is still needed.
tqdm_notebook = tqdm


  from tqdm.autonotebook import tqdm, trange


# Load the dataset

# Step 1: Preprocessing and cleaning

In [8]:
# Read the raw article export; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="articles.csv")

In [9]:
# Preview the first five rows to eyeball the columns and the raw HTML in the text fields.
data.head()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bell�s Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL D�VOILE LA CONCEPTION INT�GRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours d�Olivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive


In [10]:
# Column dtypes and non-null counts. In the recorded run only Article.Banner.Image and
# Tonality contain missing values; the text columns and the Article_Type target are complete.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4305 entries, 0 to 4304
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Id                    4305 non-null   object
 1   Heading               4305 non-null   object
 2   Article.Banner.Image  1753 non-null   object
 3   Outlets               4305 non-null   object
 4   Article.Description   4305 non-null   object
 5   Full_Article          4305 non-null   object
 6   Article_Type          4305 non-null   object
 7   Tonality              3873 non-null   object
dtypes: object(8)
memory usage: 269.2+ KB


In [11]:
# Summary statistics. Every column is object dtype, so this reports count/unique/top/freq;
# in the recorded run Heading and Full_Article have fewer unique values than rows,
# i.e. some articles are duplicated.
data.describe()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
count,4305,4305,1753,4305,4305,4305,4305,3873
unique,4305,4020,1686,1762,4290,4304,7,3
top,49348418-84bf-4e74-bb23-ca718776853f,Boeing CEO: First Operational Self-Flying Cars...,http://5b0988e595225.cdn.sohucs.com/images/201...,WeChat,<p>Airbus Helicopters has delivered the first ...,<p>It stated that a �one-time visual inspectio...,Commercial,Positive
freq,1,8,3,208,2,2,2470,3286


In [12]:
# Non-null count per column (the same counts that data.info() reports).
data.count()

Id                      4305
Heading                 4305
Article.Banner.Image    1753
Outlets                 4305
Article.Description     4305
Full_Article            4305
Article_Type            4305
Tonality                3873
dtype: int64

In [13]:
# Distinct values of the classification target; the recorded run shows seven article types.
data['Article_Type'].unique()

array(['Commercial', 'Military', 'Training', 'Executives', 'Others',
       'Financing', 'Support & Services'], dtype=object)

Next, combine the text columns into a single column, clean out the HTML markup, and split the data into training and testing sets.

In [14]:
# Build one text field per article: heading, description and full body joined by single spaces.
data['combinetext'] = data['Heading'].str.cat(
    [data['Article.Description'], data['Full_Article']],
    sep=' ',
)


In [15]:
# Clean the combined text before it is embedded:
#   1. Replace every HTML tag with a space. `[^>]*` also matches newlines, so tags that span
#      lines are removed, and substituting a space (not '') keeps the words on either side of
#      "</p><p>" from being glued together.
#   2. Decode HTML entities (e.g. "&eacute;" -> "é") *after* the tags are gone, so escaped
#      "&lt;...&gt;" text is not mistaken for markup. Missing values are passed through.
#   3. Collapse the whitespace runs left behind and trim the ends.
data['combinetext'] = (
    data['combinetext']
    .str.replace(r'<[^>]*>', ' ', regex=True)
    .map(html.unescape, na_action='ignore')
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

Split the data into training and testing sets

In [16]:
# Features are the cleaned article text; the target is the article type.
X, y = data['combinetext'], data['Article_Type']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Peek at the first training rows; the non-sequential index shows the split shuffled the data.
X_train.head()

1302    First Images of Prototype Marine One Helicopte...
1123    Hengyang people can take a helicopter to heave...
1321    Turkish military helicopter crashes on suburba...
3894    These Secret Helicopters Were Flown by a Shado...
1929    Static Display at MEBAA Show 2018 Announced Wh...
Name: combinetext, dtype: object

In [18]:
from sentence_transformers import SentenceTransformer

In [19]:
# Load the pretrained Sentence-BERT encoder used to turn article text into embeddings.
# NOTE(review): the model card for 'bert-base-nli-mean-tokens' reportedly marks this
# checkpoint as deprecated, and the encoder presumably truncates long inputs, so only the
# start of each article may be embedded — confirm before reuse. The same checkpoint name is
# loaded again in the Flask section, so the two must be changed together.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [20]:
# Embed the training split first, then the test split, with the same encoder settings.
X_train_embeddings, X_test_embeddings = (
    model.encode(texts.tolist(), show_progress_bar=True)
    for texts in (X_train, X_test)
)

Batches:   0%|          | 0/108 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

MODEL SELECTION AND TUNING

In [21]:
from sklearn.linear_model import LogisticRegression

Train a Logistic Regression model

In [22]:
# Baseline classifier on the sentence embeddings; the high iteration cap gives the
# solver room to converge.
clf = LogisticRegression(max_iter=7000)
clf.fit(X=X_train_embeddings, y=y_train)


LogisticRegression(max_iter=7000)

In [23]:
from sklearn.model_selection import GridSearchCV

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Feature standardiser (per-feature zero mean / unit variance); used as the first step of
# the pipeline defined in a later cell.
scaler = StandardScaler()

In [29]:
from sklearn.pipeline import Pipeline

In [30]:
# Chain feature scaling and the classifier into a single estimator.
# NOTE(review): `clf` here is the estimator fitted in the earlier training cell, and the grid
# search further down is run on a bare LogisticRegression rather than on this pipeline, so
# the scaler is never actually applied — confirm whether the pipeline should be tuned instead.
pipeline = Pipeline(steps=[('scaler', scaler), ('clf', clf)])


In [31]:
# Rebind `clf` to a fresh, *unfitted* estimator to hand to the grid search.
# From here on `clf` no longer refers to the model fitted in the earlier training cell.
clf = LogisticRegression(max_iter=7000)

In [None]:
# Search space: 4 regularisation strengths x 5 solvers = 20 candidate settings.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

In [None]:
# Exhaustive 5-fold cross-validated search over `param_grid`, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_embeddings, y_train)



In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
best_params = grid_search.best_params_

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predict with the tuned model. `clf` was re-created (unfitted) before the grid search and
# GridSearchCV only fits clones of the estimator it is given, so calling `clf.predict` here
# would raise NotFittedError. `best_estimator_` is the best candidate refitted on the full
# training set (GridSearchCV's default refit=True).
y_pred = grid_search.best_estimator_.predict(X_test_embeddings)


In [None]:
# Per-class precision / recall / F1 on the held-out test split, as a formatted string.
textoutput = classification_report(y_true=y_test, y_pred=y_pred)


In [None]:
# print() renders the report's line breaks; a bare expression would show the escaped repr.
print(textoutput)

Model Saving and Reloading:

In [None]:
import joblib


In [None]:
# Persist the tuned, fitted estimator. `clf` itself is never fitted after it is re-created
# for the grid search (GridSearchCV fits clones), so dumping `clf` would save a model that
# cannot predict; `best_estimator_` is the refitted winner of the search.
joblib.dump(grid_search.best_estimator_, 'text_classification_model.pkl')

In [None]:
# Reload the persisted estimator to check that the file round-trips.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model = joblib.load('text_classification_model.pkl')

Creating an API Endpoint using Flask:

In [None]:
from flask import Flask, request, jsonify
import joblib
from sentence_transformers import SentenceTransformer

In [None]:
# Flask application object for the prediction API.
app = Flask(__name__)

Load the trained classifier and the Sentence-BERT embedding model


In [None]:
# Classifier persisted by the training section. joblib files are pickle-based, so only load
# a file you produced or otherwise trust.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# SentenceTransformer encoder — re-running the embedding cell after this one would fail.
model = joblib.load('text_classification_model.pkl')
# Must be the same embedding checkpoint that was used to build the training features.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
@app.route('/predict', methods=['POST'])
def predict():
    """Classify one article and return its predicted type as JSON.

    Expects a POST request whose JSON body is an object with a string ``text`` field.

    Returns:
        ``{"Article_Type": <label>}`` with status 200 on success, or
        ``{"error": <message>}`` with status 400 when the body is not JSON or has no
        string ``text`` field.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising,
    # so every bad request gets the same explicit 400 response below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': "Request body must be JSON with a string 'text' field."}), 400

    # NOTE(review): the training text had its HTML tags stripped before embedding; callers
    # should send text cleaned the same way — confirm whether that belongs in this handler.
    text_embedding = sbert_model.encode([payload['text']])
    prediction = model.predict(text_embedding)
    # Cast to a built-in str so the label serialises regardless of the array's element type.
    return jsonify({'Article_Type': str(prediction[0])})

In [None]:
if __name__ == '__main__':
