Installing libraries

In [1]:
import stanza
import numpy as np
import pandas as pd

In [2]:
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
import os
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import re
# import pickle
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venkatasaisumanthsadu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/venkatasaisumanthsadu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/venkatasaisumanthsadu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/venkatasaisumanthsadu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Importing bad words vocab and dataset

In [3]:
bad_words=[]
with open('./data/bad-words.csv') as f:
    for line in f.readlines():
        bad_words.append(line.split('\n')[0])

bad_words_set = set(bad_words)
dataset = pd.read_csv('news_comments.csv')

In [4]:
dataset

Unnamed: 0.1,Unnamed: 0,comments
0,0,Man this whole rivalry between Michigan and Oh...
1,1,I get this feeling that derailments happen A L...
2,2,They happen a lot. I work in shipping and use ...
3,3,Weird that when we don't invest in infrastruct...
4,4,"Well, they've clearly learned from East Palest..."
...,...,...
51698,51698,Ukraine also has the average of superior train...
51699,51699,Does NK make their own shells? Or was Russia b...
51700,51700,counterpoint: if there's one thing Best Korea ...
51701,51701,American intelligence is absolutely crucial to...


In [10]:
def remove_urls(text):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    return (text)

def remove_contractions(text) :
    expanded_words = []
    expanded_text = ''
    for word in text.split():
        expanded_words.append(contractions.fix(word))
        expanded_text = ' '.join(expanded_words)
    return expanded_text


remove_non_english = lambda s: re.sub(r'[^a-zA-z]', ' ', s)
remove_spaces = lambda s: re.sub(' +',' ', s)


In [11]:

def cleaning(text):
    #remove urls
    text = remove_urls(text)
    #remove html tags
    text = BeautifulSoup(text, "lxml").text
    #remove contractions 
    text = remove_contractions(text)
    #remove non-alphabetic chars 
    text = remove_non_english(text)
    #lowercase
    text = text.lower( )
    #remove extra spaces 
    text = remove_spaces(text)
    
    return text

In [12]:
dataset['cleaned_comments'] = list(map(cleaning, dataset.comments))

def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    return text

dataset['cleaned_comments'] = dataset['cleaned_comments'].apply(lambda row: convert_emojis(str(row)))



In [None]:
# from nltk.corpus import stopwords

# to_remove = ['not']
# new_stopwords = set(stopwords.words('english')).difference(to_remove)

# dataset['cleaned_comments'] = dataset['cleaned_comments'].apply(lambda x: " ".join(x for x in x.split() if x not in new_stopwords))

# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()

# dataset['cleaned_comments'] = dataset['cleaned_comments'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))


Importing Stanza POS tagger

In [13]:
pos_tagger = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos',use_gpu=True)

2023-03-21 15:29:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-03-21 15:29:25 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2023-03-21 15:29:25 INFO: Using device: cpu
2023-03-21 15:29:25 INFO: Loading: tokenize
2023-03-21 15:29:25 INFO: Loading: pos
2023-03-21 15:29:25 INFO: Done loading processors!


POS Tags for our dataset

In [14]:
dataset_sample = dataset[:1000].copy()

In [15]:
dataset_sample['cleaned_comments'][1]

'i get this feeling that derailments happen a lot and we are only paying attention now because of what happened in ohio that said why the fuck do they happen so often '

In [16]:
pos_tags_dataset_sample =[]
for sentence in dataset_sample.iloc[:,2].values.tolist():
    doc = pos_tagger(sentence)
    pos_tags_dataset_sample.append([word.xpos for sent in doc.sentences for word in sent.words])

Writing the tags to a txt file (Simply)

In [17]:
with open('pos_tags_dataset_sample.txt','w') as f:
    f.writelines([str(i) for i in pos_tags_dataset_sample])

In [93]:
# pos_tags_dataset_sample=[]
# with open('/Users/venkatasaisumanthsadu/Documents/clg-documents/CSCI544/project/CSCI544_NLP_Project-main/pos_tags_dataset_sample.txt','r') as f:
#     pos_tags_dataset_sample.append(f.readlines())

Labelling the dataset

In [29]:
with open('pos_tags_dataset_sample.txt','r') as f:
    Lines = f.readlines()

    # print([tag.strip("'") for tag in Lines[2].strip('\n').split(', ')])
    pos_tags_dataset_sample = []
    for i in range(len(Lines)):
        pos_tags_dataset_sample.append([tag.strip("'") for tag in Lines[i].strip('\n').split(', ')])

In [30]:
len(dataset['comments'][0].split(' '))

13

In [31]:
len(dataset['comments'][1][:].split(' '))

31

In [33]:
pos_tags_dataset_sample[0]

['NN',
 'DT',
 'JJ',
 'NN',
 'IN',
 'NNP',
 'CC',
 'NNP',
 'VBZ',
 'VBG',
 'IN',
 'IN',
 'NN']

In [34]:
len(dataset_sample['cleaned_comments'][2].split())

35

In [35]:
dataset_sample['cleaned_comments'][0]

'man this whole rivalry between michigan and ohio is getting out of hand '

In [55]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [45]:
dataset_sample['pos_tag'] = pos_tags_dataset_sample

In [56]:
dataset_sample.iloc[0,2:5]

cleaned_comments    man this whole rivalry between michigan and ohio is getting out of hand 
labels                                                                                     0
pos_tag                             [NN, DT, JJ, NN, IN, NNP, CC, NNP, VBZ, VBG, IN, IN, NN]
Name: 0, dtype: object

In [36]:
labels= []
word_indices=[]
for i,sentence in enumerate(dataset_sample['cleaned_comments']):
    #capture bw indices for each sentence
    flag = False
    for j,word in enumerate(sentence.split()):
        if word.lower() in bad_words_set:
            flag = True
            pos_tags_dataset_sample[i][j] = 'BW'
            print(i,j)
            continue
    labels.append(1) if flag else labels.append(0)

1 26
7 10
7 16
15 31
16 12
16 48
20 26
23 25
25 58
29 11
34 35
34 43
34 52
34 64
35 10
36 4
39 18
40 9
44 0
45 2
47 9
54 25
62 12
62 13
67 1
70 4
71 8
77 6
79 1
85 42
85 48
86 17
87 8
87 77
87 108
87 112
87 139
91 2
93 2
94 55
100 9
101 16
101 17
103 21
103 38
103 39
104 25
104 42
105 1
114 5
115 30
115 42
118 2
120 4
122 13
127 17
132 6
132 24
132 52
133 2
136 23
141 2
151 10
151 33
159 12
161 27
162 40
163 4
165 7
169 21
178 14
179 20
179 34
180 14
185 1
185 2
187 13
190 6
191 3
199 7
199 15
202 2
203 0
204 0
208 48
210 4
211 5
214 9
216 0
217 3
218 66
218 95
219 6
221 8
222 0
225 11
228 8
235 7
236 8
236 18
236 24
236 25
236 30
236 46
236 54
237 36
237 45
239 4
241 0
242 22
242 23
242 25
242 38
242 44
244 13
249 0
252 1
252 23
252 59
256 1
256 3
260 14
263 2
269 6
269 19
275 7
276 27
276 28
278 23
278 28
282 38
283 9
287 13
290 0
292 13
297 4
297 21
300 2
302 2
307 4
308 1
311 17
321 12
322 49
328 1
332 26
334 12
339 28
340 10
349 3
349 11
354 13
355 0
355 3
355 6
357 32
358 5
361 1

Percentage of bad sentences

In [57]:
np.count_nonzero(labels)

275

In [58]:
np.count_nonzero(labels)/len(labels)

0.275

Separating good and bad sentences

In [59]:
dataset_sample['labels'] = labels

In [60]:
good_sentences = dataset_sample[dataset_sample['labels']==0]
bad_sentences = dataset_sample[dataset_sample['labels']==1]

In [61]:
len(good_sentences)
good_sentences.to_csv('good_sentences.csv')

In [62]:
len(bad_sentences)
bad_sentences.to_csv('bad_sentences.csv')

In [63]:
sample_bad = bad_sentences.sample(10)

In [64]:
sample_bad

Unnamed: 0.1,Unnamed: 0,comments,cleaned_comments,labels,pos_tag
349,349,Is it still “conspiracy” to be skeptical that maybe some internal attack is or could be happening under situations that are always getting passed up as “statistically normal”?,is it still conspiracy to be skeptical that maybe some internal attack is or could be happening under situations that are always getting passed up as statistically normal,1,"[VBZ, PRP, RB, BW, TO, VB, JJ, IN, RB, DT, JJ, BW, VBZ, CC, MD, VB, VBG, IN, NNS, WDT, VBP, RB, VBG, VBN, RP, IN, RB, JJ]"
994,994,"In fairness, the article has a photo in this case and it do be lookin like a \*crash\* crash.",in fairness the article has a photo in this case and it do be lookin like a \ crash\ crash,1,"[IN, NN, DT, NN, VBZ, DT, NN, IN, DT, NN, CC, PRP, VBP, VB, VBG, IN, DT, HYPH, NN, BW]"
806,806,"Am I stupid in thinking that this seems like a job a machine could do and eliminate this type of error? Yeah, I get that it would eliminate a job here but the cost of that seems much better than having what happened in Ohio in other places happen again.",am i stupid in thinking that this seems like a job a machine could do and eliminate this type of error yeah i get that it would eliminate a job here but the cost of that seems much better than having what happened in ohio in other places happen again,1,"[VBP, PRP, BW, IN, VBG, IN, DT, VBZ, IN, DT, NN, DT, NN, MD, VB, CC, VB, DT, NN, IN, NN, UH, PRP, VBP, IN, PRP, MD, VB, DT, NN, RB, CC, DT, NN, IN, DT, VBZ, RB, JJR, IN, VBG, WP, VBD, IN, NNP, IN, JJ, NNS, VB, RB]"
658,658,America finished fucking around and now we are finding out.,america finished fucking around and now we are finding out,1,"[NNP, VBD, BW, RB, CC, RB, PRP, VBP, VBG, RP]"
427,427,"Oh God first the giant spy balloons now the hazmat train derailments, what else have they been keeping from us...",oh god first the giant spy balloons now the hazmat train derailments what else have they been keeping from us,1,"[UH, BW, RB, DT, JJ, NN, NNS, RB, DT, NN, NN, VBZ, WP, RB, VBP, PRP, VBN, VBG, IN, PRP]"
235,235,Maybe we should give the workers paid sick leave,maybe we should give the workers paid sick leave,1,"[RB, PRP, MD, VB, DT, NNS, VBN, BW, NN]"
322,322,"So.....I'm starting to get a funny feeling I'm causing these. Due to some concerns we are having my family is debating moving to a lower COL locations where we can find work...Those options and the leading options were Ohio....then derailment...And...Michigan. Are you fucking kidding me?Based on this. Minnesota and Texas have derailments, I'm pretty sure I'm a super villain guys.",so i am starting to get a funny feeling i am causing these due to some concerns we are having my family is debating moving to a lower col locations where we can find work those options and the leading options were ohio then derailment and michigan are you fucking kidding me based on this minnesota and texas have derailments i am pretty sure i am a super villain guys,1,"[RB, PRP, VBP, VBG, TO, VB, DT, JJ, NN, PRP, VBP, VBG, DT, IN, IN, DT, NNS, PRP, VBP, VBG, PRP$, NN, VBZ, VBG, VBG, IN, DT, JJR, NN, NNS, WRB, PRP, MD, VB, NN, DT, NNS, CC, DT, VBG, NNS, VBD, NNP, RB, NN, CC, NNP, VBP, PRP, BW, VBG, PRP, VBN, IN, DT, NNP, CC, NNP, VBP, NNS, PRP, VBP, RB, JJ, PRP, VBP, DT, JJ, NN, NNS]"
180,180,"This is what happens when Republicans save their rich buddies a fortune while gutting American Infrastructure . This is the ""America First"" party as they call themselves .",this is what happens when republicans save their rich buddies a fortune while gutting american infrastructure this is the america first party as they call themselves,1,"[DT, VBZ, WP, VBZ, WRB, NNPS, VBP, PRP$, JJ, NNS, DT, NN, IN, VBG, BW, NN, DT, VBZ, DT, NNP, JJ, NN, IN, PRP, VBP, PRP]"
237,237,"Aussie here, but I keep hearing how Biden reduced the money required to maintain the rail network in America? Is this true? Sounds similar to how our former prime minister reduced the money for pre bush fire season and half of Auatralia was basically on fire.",aussie here but i keep hearing how biden reduced the money required to maintain the rail network in america is this true sounds similar to how our former prime minister reduced the money for pre bush fire season and half of auatralia was basically on fire,1,"[NN, RB, CC, PRP, VBP, VBG, WRB, NNP, VBD, DT, NN, VBN, TO, VB, DT, NN, NN, IN, NNP, VBZ, DT, JJ, VBZ, JJ, IN, WRB, PRP$, JJ, JJ, NN, VBD, DT, NN, IN, NN, NNP, BW, NN, CC, NN, IN, NN, VBD, RB, IN, BW]"
382,382,"[This article](https://www.independent.co.uk/news/world/americas/train-derailments-in-2023-how-many-b2283418.html) has this metric:> The Bureau of Transportation Statistics records 54,539 train derailments between 1990 to 2021, an average of 1,704 per year.Also includes:> Rail experts have argued, however, that the East Palestine crash was the inevitable result of compromised safety measures and reduced workforces, part of an effort to boost rail company profits.",[this article] derailments in how many b html has this metric the bureau of transportation statistics records train derailments between to an average of per year also includes rail experts have argued however that the east palestine crash was the inevitable result of compromised safety measures and reduced workforces part of an effort to boost rail company profits,1,"[-LRB-, DT, NN, -RRB-, VBZ, IN, WRB, JJ, NN, NN, VBZ, DT, NN, DT, NN, IN, NN, NNS, VBZ, NN, NNS, IN, IN, DT, NN, IN, IN, NN, RB, VBZ, NN, NNS, VBP, VBN, RB, IN, DT, BW, NNP, NN, VBD, DT, JJ, NN, IN, VBN, NN, NNS, CC, VBN, NNS, NN, IN, DT, NN, TO, VB, NN, NN, NNS]"


Lucene

In [None]:
# import lucene
# from java.io import StringReader
# from org.apache.lucene.analysis.standard import StandardAnalyzer
# from org.apache.lucene.index import DirectoryReader
# from org.apache.lucene.search.similarities import ClassicSimilarity
# from org.apache.lucene.search import IndexSearcher
# from org.apache.lucene.queryparser.classic import QueryParser
# from org.apache.lucene.store import SimpleFSDirectory
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Initialize Lucene
# lucene.initVM()

# # Set up the analyzer and similarity algorithm
# analyzer = StandardAnalyzer()
# similarity = ClassicSimilarity()

# # Set up the index
# index_dir = SimpleFSDirectory(File("index"))
# searcher = IndexSearcher(DirectoryReader.open(index_dir))
# searcher.setSimilarity(similarity)

# # Define the query and tags
# query = "python programming"
# tags = ["python", "programming"]

# # Tokenize and vectorize the tags using TF-IDF
# tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
# tfidf_matrix = tfidf.fit_transform(tags)

# # Search the index for similar sentences
# query_parser = QueryParser("content", analyzer)
# query = query_parser.parse(query)
# top_docs = searcher.search(query, 10)
# for score_doc in top_docs.scoreDocs:
#     doc = searcher.doc(score_doc.doc)
#     sentence = doc.get("content")
#     tfidf_score = cosine_similarity(tfidf_matrix, tfidf.transform([sentence]))[0][0]
#     if tfidf_score > 0.5:
#         print(f"Similar sentence found: {sentence}")

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the sentences
sentences = [
    # "The quick brown cat jumps over the lazy dog"
]

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Compute the tf-idf matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Print the similarity matrix
print(cosine_sim_matrix)


[[1.         0.25861529 0.25861529]
 [0.25861529 1.         0.25861529]
 [0.25861529 0.25861529 1.        ]]


In [4]:
# with open('pos_tags_dataset_duplicate.txt','r') as f:
#     Lines = f.readlines()

#     # print(Lines[0])
#     lst = []
#     for i in range(len(Lines)):
#         lst.append(list(Lines[i]))

'NN', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'CC', 'NNP', 'VBZ', 'VBG', 'IN', 'IN', 'NN', '.'

