In [1]:
# --- Imports: standard library first, then third-party packages ---
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Fetch the NLTK "punkt" tokenizer data.
nltk.download('punkt')

# Sentence-BERT encoder used to embed the article text.
model = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw article table and preview its first five rows (head() defaults to 5).
# NOTE(review): apostrophes appear to be missing in the preview output
# (e.g. "Bells Nexus"); the file may not actually be latin-1 — confirm the
# source encoding.
data = pd.read_csv(
    "articles.csv",
    encoding='latin-1',
)
data.head()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bells Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours dOlivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive


In [3]:
# Is there at least one missing value anywhere in the table?
has_missing = data.isna().values.any()
print(has_missing)

True


In [4]:
# Total number of missing cells: per-column NaN counts, summed across columns.
data.isna().sum().sum()

2984

In [5]:
# Only the Article_Type and Full_Article columns are used for modelling below;
# check that these columns (and Heading) contain no missing values.
for column in ('Article_Type', 'Heading', 'Full_Article'):
    print(data[column].isna().values.any())

False
False
False


In [6]:
# Independent copy holding just the label column and the article text.
new_data = data.loc[:, ['Article_Type', 'Full_Article']].copy()

In [7]:
# Vectorise every article with Sentence-BERT.
# encode() is documented to accept a str or a list of str. A pandas Series is
# only handled correctly while its index is the default 0..n-1 (items are
# looked up by integer key), so hand over a plain Python list instead.
sentence_embeddings_Article = model.encode(new_data['Full_Article'].tolist())

In [8]:
# Show a single example: the first article together with its embedding vector.
sample = next(zip(new_data['Full_Article'], sentence_embeddings_Article), None)
if sample is not None:
    sentence, embedding = sample
    print("Sentence:", sentence)
    print("Embedding:", embedding)

Sentence: <p>The helicopter that crashed in Southeast Alaska in late September, killing three people, entered a 500-foot freefall before dropping to a Glacier Bay National Park beach, according to by the National Transportation Safety Board.&nbsp;The preliminary NTSB report released Friday offers no official probable cause. That determination won&lsquo;t be made until next year at the earliest.</p>
Embedding: [-3.00309900e-03  2.38306094e-02  2.55466979e-02  8.20990931e-03
  7.58432522e-02  3.03759594e-02 -4.55767214e-02  6.86481446e-02
 -1.21290544e-02  1.15365252e-01  6.37873337e-02 -2.67707137e-03
  1.21438403e-04 -9.44028869e-02  1.69798303e-02  2.01346260e-02
 -6.18147962e-02 -2.72768363e-02 -4.00289036e-02  4.95141260e-02
 -3.67529457e-03  2.97724307e-02 -2.14730594e-02  2.37362422e-02
  2.82483827e-02  4.85450104e-02 -2.91802455e-02 -1.53328776e-02
  4.60210405e-02  1.34227714e-02 -1.25043653e-02  3.15553546e-02
 -2.35974863e-02 -1.85018359e-03  3.93959507e-02  5.84680699e-02
 -

In [9]:
# Inputs are the article embeddings; targets are the article-type labels.
features = sentence_embeddings_Article
class_ = new_data['Article_Type']

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    features,
    class_,
    test_size=0.25,
    random_state=42,
)

print('X_tr shape:', X_tr.shape)

X_tr shape: (3228, 384)


In [10]:
# Report the dimensions of the validation feature matrix.
print('X_val shape:', X_val.shape)

X_val shape: (1077, 384)


In [13]:
# Number of distinct article types (missing values, if any, count as one category).
data['Article_Type'].nunique(dropna=False)

7

In [14]:
# k-nearest-neighbours classifier configured with k = 7.
# NOTE(review): k matches the number of article types counted in the previous
# cell; k is a tunable hyperparameter and need not equal the class count —
# confirm that 7 is intended.
N_NEIGHBORS = 7
neigh = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

In [15]:
# Fit the classifier on the training embeddings and their labels.
neigh.fit(X=X_tr, y=y_tr)

In [16]:
# Predict an article type for every row of the validation set.
y_pred = neigh.predict(X=X_val)

In [17]:
# Fraction of validation rows whose predicted type matches the true label.
print("Accuracy score:", accuracy_score(y_true=y_val, y_pred=y_pred))

Accuracy score: 0.9229340761374187


In [18]:
# Persist the fitted classifier to disk.
filename = "model.pickle"
# Use a context manager so the file is flushed and closed even if dump() raises;
# a bare open() passed inline would leave the handle unclosed.
with open(filename, "wb") as model_file:
    pickle.dump(neigh, model_file)

In [19]:
# Reload the classifier from disk; the context manager guarantees the file
# handle is closed once loading finishes.
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# Unseen article text used to try out the reloaded model on new data.
# Source: https://helihub.com/2018/10/22/1st-h125-delivered-with-blr-fastfin-kit/
input_article="The first H125 equipped with the “Supplemental Type Certificate” (STC) BLR FastFin directly installed by Airbus on the assembly line was delivered to Heli-Austria. Thanks’ to the partnership signed with BLR Aerospace, Airbus Helicopters is now proposing the BLR FastFin Tail Rotor Enhancement and Stability System as an option or a retrofit for its H125 aircraft. As an option on a new aircraft, the customer will benefit from an integrated solution with an aircraft immediately ready for operations at its delivery. The BLR FastFin is providing outstanding performances in high and hot conditions by reducing the power required by the tail rotor, both for flight efficiency and flight experience: Read more at https://helihub.com/2018/10/22/1st-h125-delivered-with-blr-fastfin-kit/"

In [21]:
# Embed the new article (encode() is given a one-element list) and classify it
# with the reloaded model.
# Only `test_vector` is bound here. A chained assignment that also rebinds
# `sentence_embeddings_Article` would throw away the training embeddings and
# break any re-run of the earlier cells that read them.
test_vector = model.encode([input_article])
output_predict = loaded_model.predict(test_vector)

In [22]:
# Echo the input text followed by the single predicted label.
predicted_label = output_predict[0]
print("input:", input_article)
print("predicted artice type:", predicted_label)

input: The first H125 equipped with the “Supplemental Type Certificate” (STC) BLR FastFin directly installed by Airbus on the assembly line was delivered to Heli-Austria. Thanks’ to the partnership signed with BLR Aerospace, Airbus Helicopters is now proposing the BLR FastFin Tail Rotor Enhancement and Stability System as an option or a retrofit for its H125 aircraft. As an option on a new aircraft, the customer will benefit from an integrated solution with an aircraft immediately ready for operations at its delivery. The BLR FastFin is providing outstanding performances in high and hot conditions by reducing the power required by the tail rotor, both for flight efficiency and flight experience: Read more at https://helihub.com/2018/10/22/1st-h125-delivered-with-blr-fastfin-kit/
predicted artice type: Commercial
