In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from ntpath import join
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from gensim.parsing.preprocessing import remove_stopwords


In [3]:
with open("sample.txt") as file_in:
    lines = []
    for line in file_in:
        lines.append(line)
doc_test=[]
for i in range(len(lines)):
    doc_test.append(lines[i].split('.'))

final_doc=[]
for i in range(len(doc_test)):
    for j in range(len(doc_test[i])):
        final_doc.append(doc_test[i][j])

without_stopwords=[]
for i in final_doc:
    filtered_sentence=remove_stopwords(i)
    without_stopwords.append(filtered_sentence)

without_stopwords

['Coronavirus disease (COVID-19) outbreak originating Wuhan, China late 2019 spread worldwide claiming 2',
 '5 million lives world 01 March 2021 (1)',
 'On 11 March 2020, World Health Organization (WHO) declared pandemic (1)',
 'Since outbreak disease WHO guidelines prioritized actions responding virus; urged government maintain health facilities, raise public awareness, stock medical supplies (2)',
 '',
 '',
 'Several modeling studies conducted early phases outbreak predict epidemic effectiveness multiple population-wide strategies, including lockdown, social distancing, quarantine, testing contact tracing, media-related awareness mitigate spread COVID-19 (3â€“9)',
 'The strict lockdown enforced limit spread COVID-19 countries Italy, Spain, France, UK steady rise cases Nepal introduced lockdown early phase pandemic (10)',
 'Lockdown blanket approach buys time prepare healthcare (active case finding testing tracing, case management, example, quarantine, isolation treatment, availabilit

In [4]:
# Converting each document into an vector
vectorizer = CountVectorizer()

bag_of_words = vectorizer.fit_transform(without_stopwords)


In [5]:
# print(bag_of_words)

bag_of_words.todense()

# print(bag_of_words.todense())


matrix([[0, 0, 0, ..., 0, 1, 1],
        [1, 0, 0, ..., 1, 0, 0],
        [0, 0, 1, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# Singular value decomposition
# This process encodes our original data into topic encoded data
svd = TruncatedSVD(n_components=1)
lsa = svd.fit_transform(bag_of_words)


In [7]:
# Using pandas to look at the output of lsa

topic_encoded_df = pd.DataFrame(lsa, columns=["topic1"])
topic_encoded_df["without_stopwords"] = without_stopwords
topic_encoded_df[["without_stopwords", "topic1"]]


Unnamed: 0,without_stopwords,topic1
0,Coronavirus disease (COVID-19) outbreak origin...,0.9449482
1,5 million lives world 01 March 2021 (1),0.05881787
2,"On 11 March 2020, World Health Organization (W...",0.480738
3,Since outbreak disease WHO guidelines prioriti...,0.8644967
4,,0.0
5,,0.0
6,Several modeling studies conducted early phase...,2.526946
7,The strict lockdown enforced limit spread COVI...,2.79513
8,Lockdown blanket approach buys time prepare he...,1.649458
9,The Government Nepal issued nationwide lockdow...,1.400808


In [8]:
dictionary = vectorizer.get_feature_names()
print(dictionary)


['01', '10', '11', '12', '13', '14', '19', '2019', '2020', '2021', '21', '23', '24', '3â', '610', 'actions', 'active', 'address', 'aimed', 'approach', 'assess', 'availability', 'available', 'awareness', 'basic', 'blanket', 'border', 'brought', 'buys', 'case', 'cases', 'cause', 'chain', 'challenges', 'china', 'claiming', 'closure', 'conducted', 'confine', 'confirmed', 'contact', 'continue', 'coronavirus', 'countries', 'covid', 'deaths', 'declared', 'decline', 'determine', 'disease', 'distancing', 'distribution', 'domestic', 'early', 'eased', 'economy', 'education', 'effectiveness', 'end', 'enforced', 'epidemic', 'equipment', 'essential', 'estimated', 'example', 'expected', 'facilities', 'fatalities', 'fatality', 'finally', 'finding', 'france', 'fundamental', 'general', 'government', 'guidelines', 'health', 'healthcare', 'helpful', 'however', 'impact', 'impacts', 'in', 'including', 'indexed', 'india', 'indicating', 'infectious', 'international', 'introduced', 'isolation', 'issued', 'ital



In [9]:
encoding_matrix = pd.DataFrame(svd.components_,index=["topic1"] ,columns=dictionary).T
encoding_matrix
# numerical values can be thought of as an expression of that word in respective topic



Unnamed: 0,topic1
01,0.001019
10,0.048493
11,0.036093
12,0.020315
13,0.008835
...,...
who,0.023336
wide,0.043841
world,0.009355
worldwide,0.016394


In [10]:
#Considering the words affecting variance in data
encoding_matrix['abs_topic1']=np.abs(encoding_matrix)
encoding_matrix.sort_values('abs_topic1',ascending=False)

Unnamed: 0,topic1,abs_topic1
cases,3.622241e-01,3.622241e-01
lockdown,3.373234e-01,3.373234e-01
19,2.417277e-01,2.417277e-01
covid,2.417277e-01,2.417277e-01
including,2.394512e-01,2.394512e-01
...,...,...
2021,1.018558e-03,1.018558e-03
million,1.018558e-03,1.018558e-03
lives,1.018558e-03,1.018558e-03
01,1.018558e-03,1.018558e-03


In [11]:
final_matrix=encoding_matrix.sort_values('abs_topic1',ascending=False)
final_matrix[["abs_topic1"]]

Unnamed: 0,abs_topic1
cases,3.622241e-01
lockdown,3.373234e-01
19,2.417277e-01
covid,2.417277e-01
including,2.394512e-01
...,...
2021,1.018558e-03
million,1.018558e-03
lives,1.018558e-03
01,1.018558e-03


In [12]:
# Extracting out final sentence from topic 1
sentence1 = final_matrix[final_matrix["abs_topic1"] >= 0.2]
sentence1[['abs_topic1']]


Unnamed: 0,abs_topic1
cases,0.362224
lockdown,0.337323
19,0.241728
covid,0.241728
including,0.239451


In [13]:
index_list = list(sentence1.index.values)
index_list

['cases', 'lockdown', '19', 'covid', 'including']

In [14]:
final_conclusion=[]
for i in range(len(final_doc)):
    for j in range(len(index_list)):
        if index_list[j] in final_doc[i]:
            final_conclusion.append(final_doc[i])
    
final_conclusion

['Coronavirus disease (COVID-19) outbreak originating from Wuhan, China in late 2019 has spread worldwide claiming more than 2',
 'Several modeling studies have been conducted during the early phases of the outbreak to predict the epidemic and effectiveness of multiple population-wide strategies, including lockdown, social distancing, quarantine, testing and contact tracing, and media-related awareness among others to mitigate the spread of COVID-19 (3â€“9)',
 'Several modeling studies have been conducted during the early phases of the outbreak to predict the epidemic and effectiveness of multiple population-wide strategies, including lockdown, social distancing, quarantine, testing and contact tracing, and media-related awareness among others to mitigate the spread of COVID-19 (3â€“9)',
 'Several modeling studies have been conducted during the early phases of the outbreak to predict the epidemic and effectiveness of multiple population-wide strategies, including lockdown, social dista

In [15]:
list_final=list(set(final_conclusion))
# list_final

In [16]:
with open('final_output.txt', 'w') as filehandle:
    for listitem in list_final:
        filehandle.write('%s\n' % listitem)


In [30]:
possible_headings=[]
for i in index_list:
    for j in index_list:
        if i != j:
                possible_headings.append(i+' '+j)

# possible_headings

In [18]:
final_headings=[]
for i in possible_headings:
    split_list=i.split()
    sorted_list=sorted(split_list)
    final_headings.append(sorted_list)

final_headings_str=[]
for i in final_headings:
        final_headings_str.append(i[0]+' '+i[1])

ff_heading=list(set(final_headings_str))
ff_heading


['cases including',
 'including lockdown',
 '19 including',
 'covid including',
 'cases covid',
 '19 lockdown',
 '19 covid',
 'cases lockdown',
 '19 cases',
 'covid lockdown']

In [19]:
vectorizer = CountVectorizer()
bag_of_words1 = vectorizer.fit_transform(ff_heading)


In [20]:
bag_of_words1.todense()


matrix([[0, 1, 0, 1, 0],
        [0, 0, 0, 1, 1],
        [1, 0, 0, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 1, 1, 0, 0],
        [1, 0, 0, 0, 1],
        [1, 0, 1, 0, 0],
        [0, 1, 0, 0, 1],
        [1, 1, 0, 0, 0],
        [0, 0, 1, 0, 1]], dtype=int64)

In [21]:
svd1 = TruncatedSVD(n_components=2)
lsa1 = svd1.fit_transform(bag_of_words1)


In [22]:
topic_encoded_df = pd.DataFrame(lsa1, columns=["topic1","topic2"])
topic_encoded_df["ff_heading"]= ff_heading
display(topic_encoded_df[["ff_heading","topic1","topic2"]])

Unnamed: 0,ff_heading,topic1,topic2
0,cases including,0.894427,-0.447214
1,including lockdown,0.894427,-0.447214
2,19 including,0.894427,0.67082
3,covid including,0.894427,-0.447214
4,cases covid,0.894427,-0.447214
5,19 lockdown,0.894427,0.67082
6,19 covid,0.894427,0.67082
7,cases lockdown,0.894427,-0.447214
8,19 cases,0.894427,0.67082
9,covid lockdown,0.894427,-0.447214


In [23]:
dictionary1 = vectorizer.get_feature_names()
dictionary1




['19', 'cases', 'covid', 'including', 'lockdown']

In [24]:
encoding_matrix1 = pd.DataFrame(
    svd1.components_, index=["topic1", "topic2"], columns=dictionary1).T
encoding_matrix1


Unnamed: 0,topic1,topic2
19,0.447214,0.894427
cases,0.447214,-0.223607
covid,0.447214,-0.223607
including,0.447214,-0.223607
lockdown,0.447214,-0.223607


In [25]:
encoding_matrix1['abs_topic1']=np.abs(encoding_matrix1["topic1"])
encoding_matrix1['abs_topic2']=np.abs(encoding_matrix1["topic2"])
encoding_matrix1.sort_values('abs_topic1',ascending=False)

Unnamed: 0,topic1,topic2,abs_topic1,abs_topic2
covid,0.447214,-0.223607,0.447214,0.223607
including,0.447214,-0.223607,0.447214,0.223607
lockdown,0.447214,-0.223607,0.447214,0.223607
19,0.447214,0.894427,0.447214,0.894427
cases,0.447214,-0.223607,0.447214,0.223607


In [26]:
final_matrix1=encoding_matrix1.sort_values('abs_topic1',ascending=False)
final_matrix1[["abs_topic1","abs_topic2"]]

Unnamed: 0,abs_topic1,abs_topic2
covid,0.447214,0.223607
including,0.447214,0.223607
lockdown,0.447214,0.223607
19,0.447214,0.894427
cases,0.447214,0.223607


In [27]:

sentence3 = final_matrix1[final_matrix1["abs_topic2"] >= 0.4]
sentence3[['abs_topic2']]


Unnamed: 0,abs_topic2
19,0.894427


In [28]:
#selecting two words according to their absolute values.
count=0
heading_final_line=[]
heading_topic_final_df=[]
corr_sorted_desc = []
corr_values=[]
for i in final_matrix1["abs_topic2"]:
    corr_values.append(i)
corr_sorted_desc.extend(sorted(corr_values,reverse=True))
for i in corr_sorted_desc:
    if count==2:
        break
    heading_topic_final_df.append(final_matrix1[final_matrix1["abs_topic2"] == i])
    count=count+1


for i in heading_topic_final_df:
    heading_final_line.extend(i.index.values)

heading_final_line



['19', 'covid', 'including', 'lockdown']

In [29]:
#output 2-word heading in new file
with open('final_heading.txt', 'w') as filehandle:
    for listitem in heading_final_line:
        filehandle.write('%s\n' % listitem)
