# Header Files

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from langdetect import detect
import re
import emoji
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Making Groups of 10 Tweets of English Language

In [2]:
def ten_in_a_group(x,n):
    df = []
    count = 0
    tweets = ""

    for i in range(n):
        if(detect(x['text'][i])=='en'):
            count=count+1
            tweets = tweets+x['text'][i]
        if(count==10):
            count=0
            df.append(tweets)
            tweets=""
    return df

# Text Preprocessing

In [3]:
punctuations = "!”$%&’()*+-/:;<=>[]^_`{|}~•@#"

def remove_punctuations(x):
    return x.translate(str.maketrans('','',punctuations))

def remove_urls(x):
    return re.sub('http://\S+|https://\S+', '', x)

def remove_escape_sequences(x):
    return x.replace("\n","")

def remove_ampersand(x):
    return x.replace("&", "and")

def remove_hashtags(x):
    return re.sub('#[A-Za-z0-9]','',x)

In [4]:
def preprocessing(x,n):
    for i in range(n):
        x['text'][i] = remove_urls(x['text'][i])
        x['text'][i] = remove_punctuations(x['text'][i])
        x['text'][i] = remove_hashtags(x['text'][i])  
        x['text'][i] = remove_escape_sequences(x['text'][i])
        x['text'][i] = remove_ampersand(x['text'][i])
        x['text'][i] = emoji.demojize(x['text'][i])

## AYODHYAVERDICT

In [5]:
data_1 = pd.read_csv("#AYODHYAVERDICT_tweets.csv")
data_1

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,x1179280919263186944,x1193844270861410306,2019-11-11 10:53:49,Prakash72091622,"Ahead of #AyodhyaVerdict, appeal to everyone t...",Twitter Web App,140,,,,...,897,1241,2019-10-02 06:24:44,False,,,,,,http://pbs.twimg.com/profile_images/1179284199...
1,x1179280919263186944,x1193844191891087360,2019-11-11 10:53:30,Prakash72091622,We respect the Hon’ble Supreme Court’s #Ayodhy...,Twitter Web App,139,,,,...,897,1241,2019-10-02 06:24:44,False,,,,,,http://pbs.twimg.com/profile_images/1179284199...
2,x1113627067,x1193844257012011008,2019-11-11 10:53:46,warewolf999,This is the last prayer offered at #BabriMasji...,Twitter for Android,140,,,,...,105502,2628,2013-01-23 06:21:07,False,https://t.co/twxHxOtlG0,http://www.google.com,,https://pbs.twimg.com/profile_banners/11136270...,http://abs.twimg.com/images/themes/theme5/bg.gif,http://pbs.twimg.com/profile_images/1000440223...
3,x1027888044307701761,x1193844250963628032,2019-11-11 10:53:44,Aayush74670776,“गिरा अरथ जल बीचि सम कहिअत भिन्न न भिन्न। \nबं...,Twitter for Android,138,,,,...,1007,788,2018-08-10 12:03:01,False,,,,,,http://pbs.twimg.com/profile_images/1027888890...
4,x762122872173191168,x1193844250661638145,2019-11-11 10:53:44,akmohanty07,"""There is a profound reason Maharshi Valmiki t...",Twitter for iPhone,140,,,,...,16755,57989,2016-08-07 03:07:07,False,,,,,,http://pbs.twimg.com/profile_images/7665359093...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,x179003453,x1193841884524949504,2019-11-11 10:44:20,chennak418,As a step to impress Sonia Gandhi&amp; Congres...,Twitter for Android,144,,,,...,24448,27433,2010-08-16 07:13:47,False,,,,,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1133588918...
196,x1095730132898246661,x1193841865541345280,2019-11-11 10:44:15,anu04706249,"By #AYODHYAVERDICT,ALL pending issues like-#Tr...",Twitter Web App,144,,,,...,133,19,2019-02-13 17:03:16,False,,,,,,http://abs.twimg.com/sticky/default_profile_im...
197,x21191906,x1193841865134465024,2019-11-11 10:44:15,UnityInDiverse,கோவில் கொள்ளைக்காரர்களின் கூடாரம்! ~ கலைஞர்\n\...,Twitter for Android,140,,,,...,65343,14349,2009-02-18 12:43:11,False,,,,https://pbs.twimg.com/profile_banners/21191906...,http://abs.twimg.com/images/themes/theme4/bg.gif,http://pbs.twimg.com/profile_images/1014359338...
198,x719049254774247424,x1193841829747146752,2019-11-11 10:44:07,MALLERSOWMYA,Just bcoz BJP got 303 seats in LS everything l...,Twitter for Android,144,,,,...,138538,139304,2016-04-10 06:27:56,False,,,,https://pbs.twimg.com/profile_banners/71904925...,,http://pbs.twimg.com/profile_images/8692152758...


In [6]:
df_1 = ten_in_a_group(data_1,data_1.shape[0])

In [7]:
df_1 = pd.DataFrame(df_1, columns=['text'])

In [8]:
preprocessing(df_1,df_1.shape[0])

In [9]:
df_1.shape

(12, 1)

## INDvAUS

In [10]:
data_2 = pd.read_csv("#INDvAUS.csv")

In [11]:
data_2

Unnamed: 0.1,Unnamed: 0,text
0,0,#INDvAUS #IndvsAus #BorderGavaskarTrophy2023 h...
1,1,Scenes in 1st inning of 3rd test\n#indvaus #IN...
2,2,#INDvAUS https://t.co/E9kuupAiau
3,3,#INDvAUS https://t.co/O1doRsFIDc
4,4,@GBNEWS @joshxhowie @LeoKearse The #quran lite...
...,...,...
997,997,I think KL Rahul was acting as sink of Bad Luc...
998,998,IND vs AUS: मैथ्यू हेडन ने लाइव कमेंट्री के दौ...
999,999,#INDvAUS के बीच चल रही #BorderGavaskarTrophy20...
1000,1000,@Brad_Hogg Can anyone give stats on which coun...


In [12]:
df_2 = ten_in_a_group(data_2,data_2.shape[0])

In [13]:
df_2 = pd.DataFrame(df_2, columns=['text'])

In [14]:
preprocessing(df_2,df_2.shape[0])

In [15]:
df_2.shape

(78, 1)

## Adani

In [16]:
data_3 = pd.read_csv("#Adani.csv")
data_3

Unnamed: 0.1,Unnamed: 0,text
0,0,#harshrajput #adani #DATING 4/4 thread\n#salon...
1,1,#harshrajput #adani #DATING 3/4 thread\n#salon...
2,2,#harshrajput #adani #DATING 2/4 thread\n#salon...
3,3,#harshrajput #adani #DATING 1/4 thread\n#salon...
4,4,"#TotalEnergies, paid $2 billion for #AdaniGree..."
...,...,...
997,997,@yadavakhilesh . #Adani के शेयरों में जो देश क...
998,998,@yadavakhilesh . #Adani के शेयरों में जो देश क...
999,999,. #Adani के शेयरों में जो देश की जनता का पैसा ...
1000,1000,10 Straight Day of NIFTY in negative.\nWhat’s ...


In [17]:
df_3 = ten_in_a_group(data_3,data_3.shape[0])
df_3 = pd.DataFrame(df_3, columns=['text'])
preprocessing(df_3,df_3.shape[0])

In [18]:
df_3.shape

(67, 1)

## Forbes

In [75]:
data_4 = pd.read_csv("#forbes.csv")
data_4

Unnamed: 0.1,Unnamed: 0,text
0,0,#Saudi #Aramco has completed the acquisition o...
1,1,「すべての物にはヒビがある。\n　そして、そこから光は入る。」\n\n#オードリー・タン の...
2,2,10 Creative #TikTok Video Ideas For Brands (Wi...
3,3,"Making money is an art, Working is an art, Bus..."
4,4,@Forbes cited @FMI_Research from our report on...
...,...,...
997,997,RT:(@petapixel) Instagram alternative 100ASA n...
998,998,.@HindenburgRes के बाद अब @Forbes ने गौतम अड़ा...
999,999,15 Low-Budget Strategies For Increasing Cash F...
1000,1000,Social Start Now generates excitement through ...


In [76]:
df_4 = ten_in_a_group(data_4,data_4.shape[0])
df_4 = pd.DataFrame(df_4, columns=['text'])
preprocessing(df_4,df_4.shape[0])

In [77]:
df_4.shape

(81, 1)

## BlackLivesMatter

In [79]:
data_5 = pd.read_csv("#BlackLivesMatter.csv")
data_5

Unnamed: 0.1,Unnamed: 0,text
0,0,#BlackLivesMatter
1,1,It's time to #DefundTheBurtonPolice #BlackLive...
2,2,Sandra Bland #BlackLivesMatter
3,3,BP's forgotten pensioners\n @DavidLammy @bbcn...
4,4,ding* DING. #PennState #BlackLivesMatter #Just...
...,...,...
997,997,ding DING? ding. Ding Ding ding #PennState #Bl...
998,998,#blackcrimesmatter #blacklivesmatter #hypocris...
999,999,https://t.co/yarYLRqJTE #neworleans \n#art #gi...
1000,1000,"WHAT IS DONE IN THE DARK, SOON COMES TO THE LI..."


In [80]:
df_5 = ten_in_a_group(data_5,data_5.shape[0])
df_5 = pd.DataFrame(df_5, columns=['text'])
preprocessing(df_5,df_5.shape[0])

In [81]:
df_5.shape

(84, 1)

## Bitcoin

In [82]:
data_6 = pd.read_csv("#Bitcoin.csv")
data_6

Unnamed: 0.1,Unnamed: 0,text
0,0,Już o 9:30 aktualizacja z rynku krypto i nie t...
1,1,Fiat: All lies\n#Bitcoin: Allies
2,2,@MwahafarN The Wolfshack is looking amazing wi...
3,3,"What’s happening?\n\nOur friend told us, that ..."
4,4,The Sandbox acquires Germany’s Sviper to add m...
...,...,...
997,997,Julian Assange on #Bitcoin in 2014! 🤯
998,998,...so you can have an overview of this emergin...
999,999,Gains aren't real until converted into #bitcoi...
1000,1000,"Make your orders, we deliver fast to any locat..."


In [83]:
df_6 = ten_in_a_group(data_6,data_6.shape[0])
df_6 = pd.DataFrame(df_6, columns=['text'])
preprocessing(df_6,df_6.shape[0])

In [84]:
df_6.shape

(76, 1)

## ChatGPT

In [109]:
data_7 = pd.read_csv("#ChatGPT.csv")
data_7

Unnamed: 0.1,Unnamed: 0,text
0,0,When you finally integrate #ChatGPT API into y...
1,1,"My POV on the uses for ChatGPT: \n- No, it doe..."
2,2,#ChatGPT のAPI ChatCompletion呼び出し、会話履歴も引数として投げる...
3,3,Blown away by this technology #ChatGPT #AI htt...
4,4,✨ #ChatGPT now opens its API! ✨ \n\nI've crea...
...,...,...
997,997,Me puse a jugar con #ChatGPT como si fuera Aki...
998,998,Prompt for copywriting 3 👇👉 https://t.co/xNzks...
999,999,🗞️Founder Of Bored Ape Yacht Club #YugaLabs Un...
1000,1000,Prompt for copywriting 4 👇👉 https://t.co/geQCk...


In [110]:
df_7 = ten_in_a_group(data_7,data_7.shape[0])
df_7 = pd.DataFrame(df_7, columns=['text'])
preprocessing(df_7,df_7.shape[0])

In [111]:
df_7.shape

(67, 1)

## Tesla

In [112]:
data_8 = pd.read_csv("#Tesla.csv")
data_8

Unnamed: 0.1,Unnamed: 0,text
0,0,"Master Plan 3 de #Tesla et Investor Day 2023, ..."
1,1,#ElPaís @elpais_cultura #DigitalArt #Metaverse...
2,2,#Rareearth miners in #China fall #Thursday as ...
3,3,Ya chocaron esto??? \nBuena nota #Tesla #siete...
4,4,Wahoo! Someone was first to spot a 2021 Tesla ...
...,...,...
997,997,Say hello to the latest iteration of the Tesla...
998,998,Concamin reconoce apertura del Gobierno de #AM...
999,999,⭕ La decisión de #Tesla de invertir 10 mil mdd...
1000,1000,⭕ La decisión de #Tesla de invertir 10 mil mdd...


In [113]:
df_8 = ten_in_a_group(data_8,data_8.shape[0])
df_8 = pd.DataFrame(df_8, columns=['text'])
preprocessing(df_8,df_8.shape[0])

In [114]:
df_8.shape

(40, 1)

## ShutDownJNU_tweets

In [119]:
data_9 = pd.read_csv("#ShutDownJNU_tweets.csv")
data_9

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,x145914904,x1194250903894675456,2019-11-12 13:49:38,1frm90Migration,We are poor people. \nWe cannot afford the hik...,Twitter for Android,140,,,,...,25680,15616,2010-05-20 05:18:45,False,https://t.co/BF8dPMPgfI,https://twitter.com/1frm90Migration,,https://pbs.twimg.com/profile_banners/14591490...,http://abs.twimg.com/images/themes/theme18/bg.gif,http://pbs.twimg.com/profile_images/9605537582...
1,x999465582482223104,x1194250874291277825,2019-11-12 13:49:31,Sunny74240064,Public won't mind funding college subsidies if...,Twitter for Android,140,,,,...,31412,43670,2018-05-24 01:42:18,False,,,,,,http://abs.twimg.com/sticky/default_profile_im...
2,x592991019,x1194250853873594369,2019-11-12 13:49:26,vvbindu,□ 473 Teachers \n□ 8309 Students \n□ 1276 Adm...,Twitter for Android,140,,,,...,3071,6072,2012-05-28 17:29:57,False,,,,https://pbs.twimg.com/profile_banners/59299101...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9237847147...
3,x132134411,x1194250820184707072,2019-11-12 13:49:18,bharathkumarsk,Public won't mind funding college subsidies if...,Twitter for iPhone,140,,,,...,22360,30025,2010-04-12 11:11:28,False,https://t.co/twxHxObKhq,https://twitter.com/twitter,,https://pbs.twimg.com/profile_banners/13213441...,http://abs.twimg.com/images/themes/theme19/bg.gif,http://pbs.twimg.com/profile_images/1169315983...
4,x134399710,x1194250813167689729,2019-11-12 13:49:16,frmchandan,For all those running the hashtag #ShutDownJNU...,Twitter for Android,122,,,,...,73,306,2010-04-18 09:06:59,False,https://t.co/bhbZL0nzOC,http://frmchandan.com,,https://pbs.twimg.com/profile_banners/13439971...,http://abs.twimg.com/images/themes/theme8/bg.gif,http://pbs.twimg.com/profile_images/5092290817...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,x161189354,x1194247341336719360,2019-11-12 13:35:28,kunalclinton,People who are trending #ShutDownJNU are the s...,Twitter for Android,107,,,,...,5977,8866,2010-06-30 06:05:52,False,,,,https://pbs.twimg.com/profile_banners/16118935...,http://abs.twimg.com/images/themes/theme14/bg.gif,http://pbs.twimg.com/profile_images/9596554535...
196,x734373707498590212,x1194247339478671360,2019-11-12 13:35:28,Rajendrashaw16,आज जो लोग #shutdownjnu कह रहे हैं उन्हें ये पत...,Twitter Web App,140,,,,...,26325,54945,2016-05-22 13:21:50,False,,,,,,http://pbs.twimg.com/profile_images/1192899773...
197,x3254322074,x1194247326585352192,2019-11-12 13:35:25,natarrajanappu1,நக்சல் கூடாரமாக இருக்கும் JNU இழுத்து மூடுவது ...,Twitter for Android,140,,,,...,115397,128365,2015-06-24 05:22:07,False,,,,https://pbs.twimg.com/profile_banners/32543220...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/6331602938...
198,x1106594332751077376,x1194247322126802944,2019-11-12 13:35:24,Thik____Hai,"Sambit Patra, Shazia Ilmi, Zafar Islam, Asifa ...",Twitter for Android,140,,,,...,16994,19197,2019-03-15 16:33:43,False,,,,,,http://pbs.twimg.com/profile_images/1113134293...


In [120]:
df_9 = ten_in_a_group(data_9,data_9.shape[0])
df_9 = pd.DataFrame(df_9, columns=['text'])
preprocessing(df_9,df_9.shape[0])

In [121]:
df_9.shape

(14, 1)

## KashmirFiles

In [126]:
data_10 = pd.read_csv("##KashmirFiles.csv")
data_10

Unnamed: 0.1,Unnamed: 0,text
0,0,@rupen_chowdhury Kashmiri Hindus ka pain unbea...
1,1,@mithunprabahar @woke_Kashmiri @ashoswai For m...
2,2,"Thanks, @jamiatimes_in for giving the space to..."
3,3,@MrSinha_ Remembering the words of #KashmirFil...
4,4,"Shaheed Abdul Rashid dar,God willing, revoluti..."
...,...,...
997,997,राजस्थान की कांग्रेस सरकार ने कश्मीरी पंडितों ...
998,998,राजस्थान की कांग्रेस सरकार ने कश्मीरी पंडितों ...
999,999,Copperware craft of #Kashmir\nThe copperware c...
1000,1000,राजस्थान की कांग्रेस सरकार ने कश्मीरी पंडितों ...


In [127]:
df_10 = ten_in_a_group(data_10,data_10.shape[0])
df_10 = pd.DataFrame(df_10, columns=['text'])
preprocessing(df_10,df_10.shape[0])

In [128]:
df_10.shape

(64, 1)

## Budget2023

In [131]:
data_11 = pd.read_csv("#Budget2023.csv")
data_11

Unnamed: 0.1,Unnamed: 0,text
0,0,मूलभूत सेवाओं पर निरंतर खर्च बढ़ा रही नीतीश सर...
1,1,@CanningsNDP Why don't you speak up for people...
2,2,While the #government has increased expenditur...
3,3,सोनीपत मेट्रोपॉलिटन डेवलपमेंट अथॉरिटी की स्थाप...
4,4,भाजप आमदार राम सातपुते यांनी शरद पवार यांचा एक...
...,...,...
997,997,ICAI Members' Journal The Chartered Accountant...
998,998,"सर्वस्पर्शी बजट है, सभी वर्गों का ध्यान रखा जा..."
999,999,#मध्य प्रदेश- के वित्त मंत्री जगदीश देवड़ा ने ...
1000,1000,#Budget2023 #MPEconomicSurvey #MP_budget #Bhop...


In [132]:
df_11 = ten_in_a_group(data_11,data_11.shape[0])
df_11 = pd.DataFrame(df_11, columns=['text'])
preprocessing(df_11,df_11.shape[0])

In [133]:
df_11.shape

(49, 1)

## Kohli

In [135]:
data_12 = pd.read_csv("#Kohli.csv")
data_12

Unnamed: 0.1,Unnamed: 0,text
0,0,Indian team didn't play spain bowling properly...
1,1,#BreakingNews #BorderGavaskarTrophy2023 #Kohli...
2,2,#news #BreakingNews #AajKiTaazaKhabar #topnews...
3,3,@daniel86cricket @BCCI Hey ! What's up sister...
4,4,"Once again, a disappointing result for Virat K..."
...,...,...
997,997,"Virat Kohli becomes the fastest to score 25,00..."
998,998,Many more to go 😍😍😍 #Kohli #IndiavsAus https:/...
999,999,So Virat Kohli didn't score runs for a long ti...
1000,1000,"King Kohli reaching another milestone, 25000 r..."


In [136]:
df_12 = ten_in_a_group(data_12,data_12.shape[0])
df_12 = pd.DataFrame(df_12, columns=['text'])
preprocessing(df_12,df_12.shape[0])

In [137]:
df_12.shape

(70, 1)

## ShivSenaCheatsMaharashtra_tweets

In [138]:
data_13 = pd.read_csv("#ShivSenaCheatsMaharashtra_tweets.csv")
data_13

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,x4695772544,x1193972603464994816,2019-11-11 19:23:46,jadhavdj11,Uddhav Thackrey’s 4 wrong assumptions: \n\n1. ...,Twitter for Android,140,,,,...,27200,34357,2016-01-02 11:42:44,False,,,,https://pbs.twimg.com/profile_banners/46957725...,,http://pbs.twimg.com/profile_images/1186240274...
1,x1161565132754784258,x1193972599463636994,2019-11-11 19:23:45,InfidelApostate,Oh my goodness! Sheikh Hasina the Islamophobe....,Twitter for Android,259,,,,...,4843,1969,2019-08-14 09:08:03,False,https://t.co/h3nCpru4kN,http://instagram.com/sameerguduru,,https://pbs.twimg.com/profile_banners/11615651...,,http://pbs.twimg.com/profile_images/1189627750...
2,x154257983,x1193972581121945600,2019-11-11 19:23:40,ashu21ster,Shiv Sena quit NDA in the center even before s...,Twitter for Android,140,,,,...,79543,65906,2010-06-10 19:51:21,False,,,,https://pbs.twimg.com/profile_banners/15425798...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1109525770...
3,x757207454484692992,x1193972553921916928,2019-11-11 19:23:34,AshokSingh246,"If BJP decides to abstain from voting, INC+NCP...",Twitter for iPhone,139,,,,...,226908,369386,2016-07-24 13:35:00,False,,,,https://pbs.twimg.com/profile_banners/75720745...,,http://pbs.twimg.com/profile_images/1101245140...
4,x757207454484692992,x1193971831927005184,2019-11-11 19:20:42,AshokSingh246,I was very excited about metro project in Pune...,Twitter for iPhone,140,,,,...,226908,369386,2016-07-24 13:35:00,False,,,,https://pbs.twimg.com/profile_banners/75720745...,,http://pbs.twimg.com/profile_images/1101245140...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,x769913375723692033,x1193970818625073152,2019-11-11 19:16:40,nishankmishra77,"Dear Maharashtra,\nYou might have 10 issues wi...",Twitter for Android,140,,,,...,8085,23395,2016-08-28 15:03:47,False,https://t.co/oA1wuSilpu,https://www.quora.com/profile/Mishra-Nishank?s...,,https://pbs.twimg.com/profile_banners/76991337...,,http://pbs.twimg.com/profile_images/1158438167...
196,x101964419,x1193970817110986753,2019-11-11 19:16:40,vipulkapadiya,महाराष्ट्र में वोटर के साथ शिवसेना का #MahaDho...,Twitter for Android,140,,,,...,1589,3576,2010-01-05 05:08:44,False,,,,,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9520296990...
197,x75953295,x1193970807891881984,2019-11-11 19:16:38,pratapsodhia,the people of #Maharashtra are never going to ...,Twitter for Android,140,,,,...,44061,6548,2009-09-21 04:02:11,False,,,,,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/7918709064...
198,x2157045175,x1193970797217538053,2019-11-11 19:16:35,amitpant1973,"""न खुदा मिला न विसाले सनम. न इधर के रहे न उधर ...",Twitter for Android,140,,,,...,93639,15250,2013-10-26 15:19:22,False,,,,https://pbs.twimg.com/profile_banners/21570451...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1190068364...


In [139]:
df_13 = ten_in_a_group(data_13,data_13.shape[0])
df_13 = pd.DataFrame(df_13, columns=['text'])
preprocessing(df_13,df_13.shape[0])

In [140]:
df_13.shape

(11, 1)

## modi

In [143]:
data_14 = pd.read_csv("#modi.csv")
data_14

Unnamed: 0.1,Unnamed: 0,text
0,0,Slowly Modi is getting cornered in his own lie...
1,1,@barandbench see the hypocrisy of this Fuc*kin...
2,2,"""हमें नियमित रूप से अच्छी बातें पढ़नी,उन बातों..."
3,3,'India Our Friend': Saudi Arabia seeks 'highes...
4,4,'Modi Most Loved': Italian PM's gleeful praise...
...,...,...
997,997,#డిజిటల్ #INDIA\n#Modi ji 🚩\n@BJP4India https:...
998,998,And the award for the Greatest Corporate Scam ...
999,999,Such positive news on #Adani will never reach ...
1000,1000,#PMO #MODI #BJP \nஐயா இந்தியாவின் மிகவும் வலிம...


In [144]:
df_14 = ten_in_a_group(data_14,data_14.shape[0])
df_14 = pd.DataFrame(df_14, columns=['text'])
preprocessing(df_14,df_14.shape[0])

In [145]:
df_14.shape

(50, 1)

## NFT

In [146]:
data_15 = pd.read_csv("#NFT.csv")
data_15

Unnamed: 0.1,Unnamed: 0,text
0,0,"""I don't need anger management, I just need pe..."
1,1,New #NFT #Domains :\n\nvanishcode98.wallet\nli...
2,2,Ted #7534 \n0.02 ETH\n32.84 USD\n0xe847d8092...
3,3,let‘s get Mobius OAT 👇\nhttps://t.co/raXdqOupr...
4,4,#NFT Statistics (Real-Time)\nCollection: bored...
...,...,...
1997,1997,TRON Partners with Oraichain for AI Integratio...
1998,1998,TRON Partners with Oraichain for AI Integratio...
1999,1999,🆓Did you know that with our app you can create...
2000,2000,Crypto donations amplify speed and global reac...


In [147]:
df_15 = ten_in_a_group(data_15,data_15.shape[0])
df_15 = pd.DataFrame(df_15, columns=['text'])
preprocessing(df_15,df_15.shape[0])

In [148]:
df_15.shape

(169, 1)

## COVID19

In [149]:
data_16 = pd.read_csv("#COVID19.csv")
data_16

Unnamed: 0.1,Unnamed: 0,text
0,0,@kaenchenkaffee @1900HO #IchHabeMitgemacht\n\n...
1,1,#Scotland mortality review \n2022 Week 09 - 20...
2,2,🔴L'arrêt de la France décidée par #Veran /#Mac...
3,3,0.23 A medida que las #vacunas #COVID19 se imp...
4,4,@Agenzia_Ansa Ora iniziano i piagnistei …ah no...
...,...,...
1997,1997,#COVID19\nhttps://t.co/Kt161uV49I
1998,1998,Unleash the power of OneNote for your Projects...
1999,1999,Hey .@AshishKJha46\n1095 days - and this immun...
2000,2000,Studies are suggesting clearly that heart atta...


In [150]:
df_16 = ten_in_a_group(data_16,data_16.shape[0])
df_16 = pd.DataFrame(df_16, columns=['text'])
preprocessing(df_16,df_16.shape[0])

In [151]:
df_16.shape

(104, 1)

# Merging all data

In [152]:
lst = [df_1, df_2, df_3,df_4, df_5, df_6,df_7, df_8, df_9, df_10, df_11, df_12, df_13, df_14, df_15, df_16]
df_result= pd.concat(lst, ignore_index=True)

In [153]:
df_result

Unnamed: 0,text
0,"Ahead of AyodhyaVerdict, appeal to everyone to..."
1,"WATCH Maulana Asghar Ali Salafi, President, Ma..."
2,For all those of the western media that were g...
3,Reporter's DiaryMet two kids who sell diyas on...
4,Just bcoz BJP got 303 seats in LS everything l...
...,...
1031,Estate Planning is not a Game of Hide and Seek...
1032,COVID19 Coronavirus outbreak updateTotal Cases...
1033,.. ? Absolutely amazing that so MANY curious s...
1034,The people bitching about WoodyHarrelson and h...


### Pegasus Model for Generating Summary

In [157]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [159]:
def summary_generator(gen_sum):
    tokens = tokenizer(gen_sum, truncation=True, padding="longest", return_tensors="pt")
    summary = model.generate(**tokens,min_new_tokens=256,no_repeat_ngram_size=1)
    
    temp=""
    temp=tokenizer.decode(summary[0])
    return temp
    

In [162]:
n = df_result.shape[0]
main_df = []
for i in tqdm(range (n), total=n):
    temp = ""
    temp = summary_generator(df_result['text'][i])
    main_df = main_df.append(temp)

  0%|          | 0/1036 [00:00<?, ?it/s]

In [173]:
main_df



In [188]:
y=main_df.split("</s>")

In [189]:
type(y)

list

In [184]:
df_result['text']

0       Ahead of AyodhyaVerdict, appeal to everyone to...
1       WATCH Maulana Asghar Ali Salafi, President, Ma...
2       For all those of the western media that were g...
3       Reporter's DiaryMet two kids who sell diyas on...
4       Just bcoz BJP got 303 seats in LS everything l...
                              ...                        
1031    Estate Planning is not a Game of Hide and Seek...
1032    COVID19 Coronavirus outbreak updateTotal Cases...
1033    .. ? Absolutely amazing that so MANY curious s...
1034    The people bitching about WoodyHarrelson and h...
1035    Remdesivir COVID19 vaxinjured EVERYONE NEEDS T...
Name: text, Length: 1036, dtype: object

In [185]:
x=[]
for t in df_result['text']:
    x.append(t)

In [191]:
y.pop()

''

In [192]:
len(y)

1036

In [193]:
dict={'x':x,'y':y}

In [194]:
pd.DataFrame(dict).to_csv("dataset.csv")

In [195]:
final_df = pd.read_csv("dataset.csv")
final_df

Unnamed: 0.1,Unnamed: 0,x,y
0,0,"Ahead of AyodhyaVerdict, appeal to everyone to...","<pad>""My essay on the AYODHYAVERDICT welcome H..."
1,1,"WATCH Maulana Asghar Ali Salafi, President, Ma...","<pad>Maulana Asghar Ali Salafi, President of M..."
2,2,For all those of the western media that were g...,"<pad>""I am an Indian, I want MandirMasjid issu..."
3,3,Reporter's DiaryMet two kids who sell diyas on...,"<pad>A day after the Ayodhya verdict, here's a..."
4,4,Just bcoz BJP got 303 seats in LS everything l...,<pad>Just bcoz BJP got 303 seats in LS everyth...
...,...,...,...
1031,1031,Estate Planning is not a Game of Hide and Seek...,<pad>The World Health Organization (WHO) has c...
1032,1032,COVID19 Coronavirus outbreak updateTotal Cases...,<pad>The number of people who have died from t...
1033,1033,.. ? Absolutely amazing that so MANY curious s...,<pad>The Covid19 lab leak theory is back in th...
1034,1034,The people bitching about WoodyHarrelson and h...,<pad>Here's a look back at some of the key sto...


In [196]:
def removepad(t):
    t.replace("<pad>","")

In [197]:
final_df['y'].apply(removepad)

0       None
1       None
2       None
3       None
4       None
        ... 
1031    None
1032    None
1033    None
1034    None
1035    None
Name: y, Length: 1036, dtype: object

In [198]:
final_df

Unnamed: 0.1,Unnamed: 0,x,y
0,0,"Ahead of AyodhyaVerdict, appeal to everyone to...","<pad>""My essay on the AYODHYAVERDICT welcome H..."
1,1,"WATCH Maulana Asghar Ali Salafi, President, Ma...","<pad>Maulana Asghar Ali Salafi, President of M..."
2,2,For all those of the western media that were g...,"<pad>""I am an Indian, I want MandirMasjid issu..."
3,3,Reporter's DiaryMet two kids who sell diyas on...,"<pad>A day after the Ayodhya verdict, here's a..."
4,4,Just bcoz BJP got 303 seats in LS everything l...,<pad>Just bcoz BJP got 303 seats in LS everyth...
...,...,...,...
1031,1031,Estate Planning is not a Game of Hide and Seek...,<pad>The World Health Organization (WHO) has c...
1032,1032,COVID19 Coronavirus outbreak updateTotal Cases...,<pad>The number of people who have died from t...
1033,1033,.. ? Absolutely amazing that so MANY curious s...,<pad>The Covid19 lab leak theory is back in th...
1034,1034,The people bitching about WoodyHarrelson and h...,<pad>Here's a look back at some of the key sto...


In [213]:
final_df['x'][600]

'Newsleaders भोपाल  मप्र की ख़बरें,प्रदेश में क्या हुआ दिनभर,देखिए न्यूज़लीडर्स पर ताजा अपडेटNewsleaders Bhopal Indore\xa0 MadhyaPardesh India Khargone Barwani ChouhanShivraj BJP Kamalnath\xa0Congress Jays bjp4mp Johar जयस Budget2023  someone tell us more about the one time 1.4bil set aside for somethimg notaslushfund  for govt use after April 1, just before the election? ablegnevertrustUCPbudget2023DSRFcanada thoughts on the Budget2023 It sounds like increasing therapy. Any creative plans for this, spring break camp, summer camp? Im still not using all of my at home funding.This AlbertaUCP Budget2023 gives an opportunity to the abndpcaucus to be the fiscal ‘conservative election option.  Slow down and target spending to really help ordinary Albertans. The UCP is recklesswithrevenue abpoliDont forget that the petrol cap and subsidies will also be removed before the Budget2023Sunaks next job is to tackle the looming disaster of another hike in energy bills come April. 20 :flushed_face:T

In [214]:
final_df['y'][600]

"<pad>Alberta's Budget2023 has been met with a mixed response on social media, as many are wondering what the government is planning to do about rising energy bills and falling oil prices ahead of elections in May. The UCPis recklesswith abpoliDont forget that there will also be another fuel tax increase"