<a href="https://colab.research.google.com/github/jakansha2001/Android_App_Projects/blob/master/Original_Dataset_Preprocessing_GPT2_BERT_XLNet_transformers2_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

GPT-2, BERT AND XLNet

# NLP Model Functions

In [None]:
!pip install bert-extractive-summarizer



In [None]:
# !pip install transformers==4.5.0 # common errors : make sure that you are installing the latest version of the Transformers library
!pip install transformers==2.2.0



In [None]:
# !pip install spacy
!pip install spacy



In [None]:
from summarizer import Summarizer, TransformerSummarizer

GPT-2

In [None]:
# GPT-2 (medium checkpoint) summarizer, built once here and reused by gptSum.
GPT2_model = TransformerSummarizer(
    transformer_type="GPT2",
    transformer_model_key="gpt2-medium",
)


def gptSum(body):
  """Summarize `body` with GPT2_model (called with min_length=60) and return it as one string."""
  summary = GPT2_model(body, min_length=60)
  return ''.join(summary)


BERT

In [None]:
# Summarizer built with its default arguments, created once and reused by bertSum.
bert_model = Summarizer()


def bertSum(body):
  """Summarize `body` with bert_model (called with min_length=60) and return it as one string."""
  summary = bert_model(body, min_length=60)
  return ''.join(summary)


XLNet

In [None]:
# XLNet (base, cased checkpoint) summarizer. It gets a descriptive name, in line
# with GPT2_model / bert_model: a bare `model` global is easy to overwrite in a
# later cell, which would silently change what xlnetSum runs.
xlnet_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")
# Backward-compatible alias for any code that still refers to the original name.
model = xlnet_model

def xlnetSum(body):
  """Summarize `body` with xlnet_model (called with min_length=60) and return it as one string."""
  return ''.join(xlnet_model(body, min_length=60))


# Importing the COVID-19 article dataset (CSV file)

In [None]:
import pandas as pd

In [None]:
# Read the articles CSV directly from its raw GitHub URL into a DataFrame.
dataset_url = 'https://raw.githubusercontent.com/jakansha2001/Datasets/master/covid19_articles1000.csv'
df = pd.read_csv(dataset_url)

In [None]:
# Pull the 'content' column out of the frame as a plain Python list.
content_column = df['content']
dataset = content_column.tolist()

In [None]:
# Work on a slice of the full list: positions 300 up to (not including) 400.
subset_range = slice(300, 400)
small_dataset = dataset[subset_range]

Preprocessing the Data

In [None]:
import re
#import string

In [None]:
def preprocessor(text):
  """Clean a raw article string before it is summarized.

  Steps, in order: lower-case the text; drop HTML-style tags (``<...>``) and
  bracketed spans (``[...]``); drop URLs; turn newlines and non-breaking spaces
  into ordinary spaces; drop zero-width spaces and backslashes.

  Args:
    text: the article body as a ``str``.

  Returns:
    The cleaned, lower-cased string.
  """
  text = text.lower()
  text = re.sub(r'<.*?>', '', text)
  # Raw string: '\[' in a normal string literal is an invalid escape sequence.
  text = re.sub(r'\[.*?\]', '', text)
  # Remove the whole URL. The previous lazy pattern 'http.*?' matched only the
  # literal "http" and left the rest of the address in the text. This runs
  # before whitespace is normalised so the URL still ends at its original
  # whitespace boundary.
  text = re.sub(r'http\S*', '', text)
  # Newlines and non-breaking spaces separate words, so replace them with a
  # space; deleting them outright glued neighbouring words together.
  text = re.sub('[\n\xa0]', ' ', text)
  # Zero-width spaces carry no visible separation and are simply removed.
  text = text.replace('\u200b', '')
  text = text.replace('\\', '')
  return text

# Run every article of the selected subset through preprocessor, keeping the order.
processedArticles = [preprocessor(raw_article) for raw_article in small_dataset]


In [None]:
processedArticles

['president donald trump said the u.s. has the outbreak of the coronavirus under control and has been briefed by the centers for disease control and prevention. speaking to cnbc, trump said he wasn\'t worried it would turn into a pandemic and said the only person infected had flown in from china. he repeated his view that the impeachment is a "hoax." trump batted away a question on whether the fed\'s balance sheet was the prime reason for the stock-market spx, +2.64% gains. he said fed interest rates should still go lower because the dollar dxy, -0.19% is strong.',
 'the outbreak of coronavirus in china has led to concerns over a slowdown in the nation’s economy and put pressure on oil and industrial metals prices on bets for potentially weaker demand for the commodities. however, if the 2002-2003 severe acute respiratory syndrome epidemic, known as sars, may serve as a guide, the overall impact of the virus outbreak may be modest and short lived on the global economy, as well as on th

Generating summaries

In [None]:
# One list of generated summaries per model; each starts out empty.
gptResult, bertResult, xlnetResult = [], [], []

In [None]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every summary to the results.
gptResult, bertResult, xlnetResult = [], [], []

# Summarize each preprocessed article with all three models. The 1-based counter
# printed after each article is a simple progress indicator.
for i, article in enumerate(processedArticles, start=1):
  gptResult.append(gptSum(article))
  bertResult.append(bertSum(article))
  xlnetResult.append(xlnetSum(article))
  print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [None]:
# Gather the three lists of summaries as the columns of one table.
data = {
    'gpt-summary': gptResult,
    'bert-summary': bertResult,
    'xlnet-summary': xlnetResult,
}
# NOTE: this rebinds `df`, which until now held the articles frame read from the CSV.
df = pd.DataFrame(data=data)
# Save the table as CSV, leaving out the index column.
df.to_csv('bbc_result.csv', index=False)

In [None]:
df.head()

Unnamed: 0,gpt-summary,bert-summary,xlnet-summary
0,has the outbreak of the coronavirus under cont...,has the outbreak of the coronavirus under cont...,has the outbreak of the coronavirus under cont...
1,the outbreak of coronavirus in china has led t...,the outbreak of coronavirus in china has led t...,the outbreak of coronavirus in china has led t...
2,burberry raised its revenue outlook for the ye...,burberry raised its revenue outlook for the ye...,burberry raised its revenue outlook for the ye...
3,"medical staff at a hospital in wuhan, china, w...","medical staff at a hospital in wuhan, china, w...","medical staff at a hospital in wuhan, china, w..."
4,global markets turned wobbly at the outbreak o...,global markets turned wobbly at the outbreak o...,global markets turned wobbly at the outbreak o...


# ROUGE AND BLEU SCORE CALCULATION

In [None]:
!pip install rouge-score



In [None]:
# One list of per-article ROUGE results per model; each starts out empty.
gptRouge, bertRouge, xlnetRouge = [], [], []

In [None]:
from rouge_score import rouge_scorer

# Scorer set up for the 'rouge1' and 'rougeL' metrics, with stemming switched on.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rougeL'],
    use_stemmer=True,
)

In [None]:
# Rebuild the score lists from scratch so re-running this cell cannot append
# duplicate entries.
gptRouge, bertRouge, xlnetRouge = [], [], []

# Walk the articles and their summaries in lockstep instead of assuming there
# are exactly 100 of them: the hard-coded range(0, 100) raised IndexError on a
# shorter run and silently ignored everything past the first 100.
for reference, gpt_summary, bert_summary, xlnet_summary in zip(
    processedArticles, gptResult, bertResult, xlnetResult):
  # The preprocessed article is passed first (as the reference text) and the
  # generated summary second, exactly as before.
  gptRouge.append(scorer.score(reference, gpt_summary))
  bertRouge.append(scorer.score(reference, bert_summary))
  xlnetRouge.append(scorer.score(reference, xlnet_summary))

In [None]:
# Print the complete list of per-article ROUGE results for each model, one
# labelled line per model.
for label, scores in (
    ('GPT Rouge Scores: ', gptRouge),
    ('BERT Rouge Scores: ', bertRouge),
    ('XLNet Rouge Scores: ', xlnetRouge),
):
  print(label, scores)

GPT Rouge Scores:  [{'rouge1': Score(precision=1.0, recall=0.4230769230769231, fmeasure=0.5945945945945945), 'rougeL': Score(precision=1.0, recall=0.4230769230769231, fmeasure=0.5945945945945945)}, {'rouge1': Score(precision=1.0, recall=0.25, fmeasure=0.4), 'rougeL': Score(precision=1.0, recall=0.25, fmeasure=0.4)}, {'rouge1': Score(precision=1.0, recall=0.14878892733564014, fmeasure=0.25903614457831325), 'rougeL': Score(precision=1.0, recall=0.14878892733564014, fmeasure=0.25903614457831325)}, {'rouge1': Score(precision=1.0, recall=0.20458553791887124, fmeasure=0.3396778916544656), 'rougeL': Score(precision=1.0, recall=0.20458553791887124, fmeasure=0.3396778916544656)}, {'rouge1': Score(precision=1.0, recall=0.15733333333333333, fmeasure=0.271889400921659), 'rougeL': Score(precision=1.0, recall=0.15733333333333333, fmeasure=0.271889400921659)}, {'rouge1': Score(precision=1.0, recall=0.1833648393194707, fmeasure=0.3099041533546326), 'rougeL': Score(precision=1.0, recall=0.1833648393194

In [None]:
# One list of per-article F-measures per model; each starts out empty.
gptFmeasure, bertFmeasure, xlnetFmeasure = [], [], []

In [None]:
# Extract the ROUGE-L F-measure of every scored article. Building each list in
# one pass over its score list removes the hard-coded count of 100 (IndexError
# on shorter runs, ignored extras on longer ones) and makes the cell safe to
# re-run, since the lists are rebuilt rather than appended to.
gptFmeasure = [scores['rougeL'].fmeasure for scores in gptRouge]
bertFmeasure = [scores['rougeL'].fmeasure for scores in bertRouge]
xlnetFmeasure = [scores['rougeL'].fmeasure for scores in xlnetRouge]

In [None]:
from statistics import mean

# Report, per model, the arithmetic mean of the per-article ROUGE-L F-measures,
# rounded to six decimal places.
print("Cumulative F Measure")
for label, fmeasures in (
    ("GPT-2 : ", gptFmeasure),
    ("BERT : ", bertFmeasure),
    ("XLNet : ", xlnetFmeasure),
):
  print(label, round(mean(fmeasures), 6))

Cumulative F Measure
GPT-2 :  0.344756
BERT :  0.354955
XLNet :  0.341528
