# Cleaning the data 

In [28]:
#imports
import pickle
import re
import pandas as pd

#nltk imports
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

# Gensim imports
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [29]:
# Read the previously pickled claims object back into memory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open pickle files that come from a trusted source.
with open('all_claims.pkl', mode='rb') as claims_file:
    all_claims = pickle.load(claims_file)

In [30]:
# Instantiate the Porter stemmer and build the English stop-word set from NLTK.
# Both names are read as globals by clean_text() at call time.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [31]:
# Define cleaning function
def clean_text(text):
    """Normalise one text string into a space-joined sequence of stems.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenise, POS-tag, keep only
    tokens that are not in the global ``stop_words`` and whose tag starts
    with 'N' or 'J' (the noun / adjective filter), then stem each kept token
    with the global ``stemmer``.

    Args:
        text: The string to clean.

    Returns:
        A single string with the stemmed tokens joined by one space.
    """
    lowered = text.lower()
    # Punctuation is deleted outright (not replaced by a space).
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tagged_tokens = pos_tag(word_tokenize(without_punctuation))

    stems = []
    for token, tag in tagged_tokens:
        if token in stop_words:
            continue
        if tag.startswith(('N', 'J')):
            stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [32]:
# Create a DataFrame from the all_claims values and drop incomplete rows.
# NOTE(review): dropna() runs before flattening, so a row with any missing
# cell is discarded as a whole (all of its values are lost) — confirm that
# this is intended rather than dropping only the missing cells.
claims_wide = pd.DataFrame(all_claims.values()).dropna()

# Flatten every remaining cell into a single-column DataFrame (one text per row).
claims_flat = pd.DataFrame(claims_wide.values.flatten())

# Clean the text data in the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1; mapping each column with
# Series.map is element-wise too and works on both older and newer pandas.
df = claims_flat.apply(lambda column: column.map(clean_text))

In [33]:
# Join every cleaned cell into one whitespace-separated string for vocabulary analysis
text_combined = ' '.join(df.to_numpy().ravel())

# Extract the individual word tokens from the combined (lowercased) string
words = re.findall(r'\b\w+\b', text_combined.lower())

# Tally how many times each word occurs
word_freq = Counter(words)

# Build a one-column frequency table (index = word) sorted from most to least frequent
word_freq_df = (
    pd.DataFrame.from_dict(word_freq, orient='index', columns=['Frequency'])
    .sort_values(by='Frequency', ascending=False)
)
word_freq_df

Unnamed: 0,Frequency
output,78
signal,62
time,62
domain,52
stimulu,46
...,...
calendar,1
rout,1
obtain,1
prioriti,1


In [34]:
# Removing Extra Stop Words and Numbers
# The extra stop words live under their own name so that the NLTK `stop_words`
# set read by clean_text() is not overwritten if cells are re-run out of order.
extra_stop_words = frozenset([
    'said', 'wherein', 'means', 'mean', 'claim', 'claims', 'claims1',
    'first', 'second', 'method', 'comprising', 'system', 'base', 'ftirther',
])

# Matches any whole token that contains at least one digit
_NUMERIC_TOKEN = re.compile(r'\b\w*\d\w*\b')


def _strip_numbers_and_stop_words(text):
    """Remove digit-bearing tokens, then extra stop words, from one string.

    Args:
        text: A cleaned text string.

    Returns:
        The remaining words joined by single spaces.
    """
    without_numbers = _NUMERIC_TOKEN.sub('', text)
    return ' '.join(
        word for word in without_numbers.split()
        if word.lower() not in extra_stop_words
    )


# Assign whole columns back instead of using chained indexing
# (`df[col][i] = ...`), which triggers SettingWithCopyWarning and does not
# write through to `df` when pandas copy-on-write is active.
for col in df.columns:
    df[col] = df[col].map(_strip_numbers_and_stop_words)

In [35]:
# Function to preprocess text data into words
def preprocess_text(texts):
    """Tokenise each text with gensim's ``simple_preprocess``.

    Args:
        texts: An iterable of strings.

    Returns:
        A list containing one token list per input text, in input order.
        ``deacc=True`` is passed through to ``simple_preprocess``.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

In [36]:
# Tokenise every column of the DataFrame and concatenate the per-column
# token lists into one flat list of documents (column order, then row order).
data_words = [
    tokens
    for col in df.columns
    for tokens in preprocess_text(df[col])
]

In [37]:
# Build the gensim dictionary from the tokenised documents
dictionary = Dictionary(data_words)
# Convert each tokenised document into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, data_words))

In [38]:
# Fit a TF-IDF model on the bag-of-words corpus
tfidf = TfidfModel(corpus, id2word=dictionary)

# Scores strictly below this cut-off count as low-value
low_value = 0.2

# Collect, document by document, the ids whose TF-IDF score falls below the
# cut-off. Despite the name, this list holds integer token ids (not word
# strings), and an id is appended once per document in which it scores low.
low_value_words = []
for bow in corpus:
    low_value_words.extend(
        token_id for token_id, score in tfidf[bow] if score < low_value
    )

# Show the collected ids
print("Low-value Words:", low_value_words)

Low-value Words: [0, 3, 4, 5, 7, 8, 10, 11, 13, 15, 16, 17, 18, 20, 5, 0, 3, 4, 5, 10, 11, 13, 16, 17, 18, 20, 21, 23, 24, 26, 27, 28, 29, 30, 34, 38, 39, 40, 18, 32, 42, 0, 2, 5, 0, 2, 5, 0, 4, 5, 7, 8, 10, 11, 13, 15, 16, 17, 20, 27, 28, 29, 30, 42, 5, 0, 4, 5, 10, 11, 16, 17, 20, 21, 23, 24, 26, 34, 39, 42, 20, 21, 23, 24, 26, 0, 0, 21, 23, 24, 26, 8, 20, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 62, 64, 65, 66, 67, 69, 70, 72, 73, 74, 75, 76, 77, 79, 81, 82, 74, 42, 68, 74, 42, 49, 57, 59, 74, 20, 73, 84, 68, 101, 68, 101, 20, 8, 20, 49, 51, 52, 54, 55, 56, 57, 58, 59, 62, 65, 66, 67, 69, 70, 72, 73, 74, 76, 77, 79, 80, 82, 95, 101, 109, 49, 54, 57, 59, 66, 74, 101, 66, 68, 101, 66, 68, 101, 20, 66, 73, 84, 101, 20, 49, 50, 52, 56, 57, 58, 59, 63, 64, 72, 73, 80, 82, 95, 110, 66, 68, 74, 74, 74, 110, 20, 51, 84, 110]


In [39]:
# Remove the low-value token ids from the dictionary.
# NOTE(review): this mutates `dictionary` in place, so re-running this cell
# applies the same ids to an already-filtered dictionary — run it once per
# freshly built dictionary.
dictionary.filter_tokens(bad_ids=low_value_words)
# Re-encode every document against the filtered dictionary
new_corpus = list(map(dictionary.doc2bow, data_words))

In [40]:
# Rebuild each document keeping only the tokens that survived the dictionary
# filtering above. `low_value_words` holds integer token ids, so testing the
# string tokens against it never matched and nothing was removed; membership
# in the filtered dictionary's token2id mapping is checked instead.
new_data_words = [
    [word for word in doc if word in dictionary.token2id]
    for doc in data_words
]

In [46]:
# Persist the cleaned DataFrame and the tokenised documents as pickle files.
# NOTE(review): 'corpus.pkl' receives the cleaned DataFrame `df` (not the
# bag-of-words `corpus`), and 'data_words.pkl' receives `data_words`, i.e. the
# token lists from before low-value filtering — confirm both are intended.
for pickle_path, payload in (('corpus.pkl', df), ('data_words.pkl', data_words)):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [45]:
# Display the tokenised documents (one list of tokens per document)
data_words

[['media',
  'gateway',
  'connect',
  'backbon',
  'step',
  'packet',
  'loss',
  'jitter',
  'backbon',
  'call',
  'packet',
  'loss',
  'predefin',
  'packet',
  'loss',
  'threshold',
  'jitter',
  'predefin',
  'jitter',
  'threshold',
  'backbon',
  'indic',
  'call',
  'indic',
  'time',
  'period',
  'predefin',
  'packet',
  'loss',
  'threshold',
  'predefin',
  'jitter',
  'threshold',
  'indic',
  'measur',
  'packet',
  'loss',
  'predefin',
  'packet',
  'loss',
  'threshold',
  'measur',
  'jitter',
  'predefin',
  'jitter',
  'threshold',
  'accept',
  'call',
  'time',
  'period',
  'least',
  'packet',
  'loss',
  'jitter',
  'indic',
  'call',
  'call',
  'backbon',
  'qualiti',
  'servic',
  'qo',
  'level',
  'call',
  'call',
  'backbon'],
 ['indic', 'extern', 'sourc', 'mobil', 'switch', 'centr', 'server'],
 ['indic', 'gateway', 'control', 'protocol', 'gcp', 'messag'],
 ['indic', 'prioriti', 'valu', 'gcp', 'messag'],
 ['gcp', 'messag', 'context', 'request'],
 ['