#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re
import string, unicodedata
import contractions
import codecs
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import spacy
import en_core_web_sm
import time
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import sys
from gensim.models import Word2Vec
nlp = en_core_web_sm.load()
nlp.max_length = 600000000
import json

#### Define Corpus

In [164]:
def get_esa_text():
    """Extract ESA Text from text files"""
    esa_text = []
    files = os.listdir("G:/Post Construction/ESA_text")
    for file in files:
        with codecs.open("G:/Post Construction/ESA_text/" + file,'r', encoding='utf-8-sig') as corpus:
            input_str = corpus.read()
            esa_text.append(input_str)
    return esa_text

def get_pcmr_text():
    """Extract PCMR Text from text files"""
    pcmr_text = []
    files = os.listdir("G:/Post Construction/PDF_text")
    for file in files:
        with codecs.open("G:/Post Construction/PDF_text/" + file,'r', encoding='utf-8-sig') as corpus:
            input_str = corpus.read()
            pcmr_text.append(input_str)
    return pcmr_text

def combine_text():
    """combine text string from ESA and PCMR text"""
    esa_corpus = get_esa_text()
    pcmr_corpus = get_pcmr_text()
    corpus = esa_corpus + pcmr_corpus
    return corpus

#### Noise Removal and Text Corpus Normalization

In [165]:
def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

def remove_non_ascii(text):
    """Remove non-ASCII characters from text string i.e. converting accented characters/letters"""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_between_delimiters(text):
    """Remove the text between two delimiters < and >"""
    text = re.sub('<[^>]+>', '', text)
    return text

def to_lowercase(text):
    """Convert all characters to lowercase from text string"""
    text = text.lower()
    return text

def lemmatize_text(text):
    """convert word in the text string to its root form"""
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_special_characters(text, remove_digits = False):
    """Removing non-alphanumeric characters and symbols or even ocasionally numeric characters"""
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def remove_stopwords(text):
    """Remove stop words from text string"""
    stopword_list = nltk.corpus.stopwords.words('english')     
    return ' '.join(word for word in text.split() if word not in stopword_list)

def prohibitedWords(text):
    """The list of words to be removed from the SQL database issues table to avoid capturing false positives"""
    text = text.split()
    prohibitedWordList = ['issue', 'become', 'therefore', 'monitor', 'compare', 'observe', 'construct', 'part', 'conduct', 'focus', 'prior', 'manage', 'consider', 'moderate', 'condition', 'potential', 'action', 'reassess', 'row', 'impact', 'control', 'management', 'good', 'unique', 'introduce', 'list', 'potentially', 'low', 'establish', 'legislation', 'exist', 'nvc']
    resultwords  = [word for word in text if word not in prohibitedWordList]
    text = ' '.join(resultwords)
    return text

In [166]:
text = "compaction _ ( ) contouring_1 subsidence_1 admixing_1 contouring_1 crowning_1 1 . loss of agricultural capability 2016 â€“ equivalent land capability have be achieve .6 . < s > wetlands</s > scalping résumé and tête-à-tête can't wouldn't"
text = replace_contractions(text)
text = remove_non_ascii(text)
text = remove_between_delimiters(text)
text = to_lowercase(text)
text = remove_special_characters(text, remove_digits= True)
text = lemmatize_text(text)
text = remove_stopwords(text)
print(text)

compaction _ contour _ subsidence _ admix _ contour _ crown _ loss agricultural capability equivalent land capability achieve wetland scalp resume teteatete would


#### Bringing it All Together - Building a Text Normalizer

In [167]:
def normalize_text_corpus(corpus):
    """Normalize each document in the corpus"""
    start_time = time.time()
    normalized_corpus = []
    corpus_size = len(get_pcmr_text())
    for doc in corpus:
        doc = replace_contractions(doc)
        doc = remove_non_ascii(doc)
        doc = remove_between_delimiters(doc)
        doc = to_lowercase(doc)
        doc = remove_special_characters(doc, remove_digits = True)
        doc = lemmatize_text(doc)
        doc = remove_stopwords(doc)
        normalized_corpus.append(doc)
    dur = round(time.time() - start_time)
    print(f"Normalized text from {corpus_size} documents in {dur} seconds ({round(dur / 60, 2)} min or {round(dur / 3600, 2)} hours)")
    return normalized_corpus

#### Further Processing and Tokenization

In [168]:
def sent_to_words(sentences):
    """using Gensim's simple text preprocessing to convert document into a list of tokens, ignoring tokens that are too short or too long"""
    start_time = time.time()
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=False))
    dur = round(time.time() - start_time)
    print(f"Tokenization and further preprocessing completed in {dur} seconds ({round(dur / 60, 2)} min or {round(dur / 3600, 2)} hours)")

In [169]:
normalized_tokens = list(sent_to_words(normalize_text_corpus(get_pcmr_text())))

Normalized text from 584 documents in 1673 seconds (27.88 min or 0.46 hours)
Tokenization and further preprocessing completed in 10 seconds (0.17 min or 0.0 hours)


#### Build Bi-grams

In [170]:
def make_bigrams(normalized_tokens):
    """ create bigrams froms normalized tokens corpus"""
    bigram = gensim.models.Phrases(normalized_tokens, min_count = 18, threshold = 16)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in normalized_tokens]

#min_count: ignore all words and bigrams with total collected count lower than this
#threshold represents a score threshold for forming the phrases (higher means fewer phrases). A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold

normalized_tokens_bigrams = make_bigrams(normalized_tokens)
tokens = 0
for i in normalized_tokens_bigrams:
    tokens += len(i)
print(f"Total tokens in the final corpus: {tokens}")

Total tokens in the final corpus: 3208853


#### Define Hyperparameters

In [171]:
# Set values for various parameters
###############################################
feature_size = 100    # word vector 
window_context = 10   # context window size i.e. maximum distance between current and predicted word within a sentence
min_word_count = 36   # Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them
sample = 1e-3         # The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
learning_rate = 0.01  # the initial learning rate
iterations = 20        # Number of iterations over the corpus

#### Generate Training Data and Model Training

In [172]:
start_time = time.time()
w2v_model = Word2Vec(min_count = min_word_count,
                     window = window_context,
                     size = feature_size,
                     sg = 1,
                     sample = sample,
                     negative = 3,
                     iter = iterations,
                     workers = 1)
w2v_model.build_vocab(normalized_tokens_bigrams)
w2v_model.train(normalized_tokens_bigrams, total_examples=w2v_model.corpus_count, epochs = w2v_model.iter)
dur = round(time.time() - start_time)
print(f"Word2vec model creation and training completed in {dur} seconds ({round(dur / 60, 2)} min or {round(dur / 3600, 2)} hours)")

Word2vec model creation and training completed in 442 seconds (7.37 min or 0.12 hours)


#### Define Target Words

In [221]:
vec_lst = ['physical_environment', 'soil', 'vegetation', 'water', 'fish', 'wetland', 'wildlife', 'species', 'air', 'air_quality', 'acoustic_environment', 'heritage', 'heritage_resource', 'access']
sub_cat_vec_lst = ['erosion', 'coarse_fragment', 'subsidence', 'compaction', 'watercourse', 'invasive', 'plant', 'weed', 'rare', 'stream', 'riparian', 'topsoil']
vec_sub_cat = []
vec_lst.extend(sub_cat_vec_lst)
vec_sub_cat.extend(vec_lst)

In [222]:
root_word_dict = {}
for root_word in vec_sub_cat:
    try:
        context_words = w2v_model.wv.most_similar(positive = [root_word],topn = 18)
        root_word_dict[root_word] = context_words
    except:
        root_word_dict[root_word] = 'The word is not in vocabulary'

In [175]:
word2vec_df = pd.DataFrame.from_dict(root_word_dict)
word2vec_df.to_csv('word2vecembeddings.csv', encoding = 'utf-8-sig')

In [223]:
def merge_keys(dict, key1, key2):
    """Merge values of similar context words together"""
    for context_word in dict[key1]:
        dict[key2].append(context_word)
    del dict[key1]
    return dict

def append_value(dict, key, value):
    """Append values in the dictionary"""
    dict[key].append(value)
    return dict

def append_key_as_value(dict):
    """Append dictionary key as value"""
    for key in dict:
        dict[key].append((key, 1.0))
    return dict

def remove_underscores_duplicates(dict):
    """Remove underscores from the dictionary keys and values bigrams followed by removing duplicates from values"""
    dict_final = {}
    for key,value in dict.items():
        new_key = key.replace('_', ' ')
        new_value = [value[0].replace('_', ' ') for value in dict[key]]
        dict_final[new_key] = new_value
    return {key:list(set(value)) for key, value in dict_final.items()}

def replace_keys(dict, old_keys, new_keys):
    """Replace some of the keys for the purpose of naming consistency in SQL database"""
    for idx, new_key in enumerate(new_keys):
        dict[new_key] = dict.pop((old_keys)[idx])
    return dict

def remove_dictionary_values(dictionary, vec, context_words):
    """Remove the context words which were incorrectly tagged to VECs in word2vec model"""
    for key, value in dictionary.items():
        if key == vec:
            for word in context_words:
                if word in value:
                    value.remove(word)
    return dict

In [224]:
vegetation_context_words = ['wetland', 'subsidence', 'erosion', 'specie'] ## excluded cover
water_context_words = ['erosion']
navigation_context_words = ['landowner', 'revegetation']
wildlife_context_words = ['air quality', 'acoustic environment', 'fish fish', 'habitat', 'special status']
air_context_words = ['acoustic environment', 'habitat', 'special status', 'wildlife', 'traditional land', 'quality', 'risk', 'farm', 'equipment', 'wash', 'part', 'therefore', 'equipment', 'move']
heritage_context_words = ['acoustic environment', 'air quality', 'property', 'special status', 'location']
physical_context_words = ['weed', 'admix', 'productivity', 'issue', 'drainage']
wetlands_context_words = ['vegetation']
acoustic_context_words = ['air quality', 'course fragment', 'wildlife', 'risk', 'habitat', 'special status']
fish_context_words = ['channel', 'stream', 'instream']
species_context_words = ['species', 'specie']

In [225]:
if __name__ == '__main__':
    append_key_as_value(root_word_dict)
    merge_keys(root_word_dict, 'erosion', 'physical_environment')
    merge_keys(root_word_dict, 'coarse_fragment', 'physical_environment')
    merge_keys(root_word_dict, 'subsidence', 'physical_environment')
    merge_keys(root_word_dict, 'compaction', 'soil')
    merge_keys(root_word_dict, 'topsoil', 'soil')
    merge_keys(root_word_dict, 'invasive', 'vegetation')
    merge_keys(root_word_dict, 'plant', 'vegetation')
    merge_keys(root_word_dict, 'weed', 'vegetation')
    merge_keys(root_word_dict, 'rare', 'vegetation')
    merge_keys(root_word_dict, 'watercourse', 'wetland')
    merge_keys(root_word_dict, 'stream', 'wetland')
    merge_keys(root_word_dict, 'riparian', 'wetland')
    merge_keys(root_word_dict, 'heritage_resource', 'heritage')
    merge_keys(root_word_dict, 'air_quality', 'air')
    append_value(root_word_dict, 'species', ('wood frog', 1.0))
    append_value(root_word_dict, 'species', ('turtle', 1.0))
    append_value(root_word_dict, 'species', ('snake', 1.0))
    append_value(root_word_dict, 'fish', ('amphibians', 1.0))
    append_value(root_word_dict, 'species', ('western toad', 1.0))
    append_value(root_word_dict, 'physical_environment', ('crown', 1.0))
    append_value(root_word_dict, 'soil', ('soil and soil productivity', 1.0))
    append_value(root_word_dict, 'air', ('quality', 1.0))
    append_value(root_word_dict, 'species', ('specie at risk', 1.0))
    append_value(root_word_dict, 'access', ('navigation', 1.0))
    dict_final = remove_underscores_duplicates(root_word_dict)
    old_keys = ['physical environment','wetland', 'acoustic environment', 'access']
    new_keys = ['physical', 'wetlands', 'acoustic', 'navigation']
    replace_keys(dict_final, old_keys, new_keys)
    remove_dictionary_values(dict_final, 'vegetation', vegetation_context_words)
    remove_dictionary_values(dict_final, 'water', water_context_words)
    remove_dictionary_values(dict_final, 'wildlife', wildlife_context_words)
    remove_dictionary_values(dict_final, 'air', air_context_words)
    remove_dictionary_values(dict_final, 'heritage', heritage_context_words)
    remove_dictionary_values(dict_final, 'physical', physical_context_words)
    remove_dictionary_values(dict_final, 'wetlands', wetlands_context_words)
    remove_dictionary_values(dict_final, 'acoustic', acoustic_context_words)
    remove_dictionary_values(dict_final, 'fish', fish_context_words)
    remove_dictionary_values(dict_final, 'navigation', navigation_context_words)
    remove_dictionary_values(dict_final, 'species', species_context_words)

In [179]:
# word2vec_clean_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dict_final.items() ]))
# word2vec_clean_df.to_csv('word2vecembeddings1.csv', encoding = 'utf-8-sig')

In [262]:
from sqlalchemy import create_engine
# Importing environmental variables library that reads from the .env file
from dotenv import load_dotenv

# Loading key-value pairs from the .env file into the OS environment
load_dotenv()

# Reading the key-value pairs from the OS environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# Using those variables in the connection string using "F" strings
conn_string = f"mysql+mysqldb://{user}:{password}@{db_hostname}/{db_name}?charset=utf8mb4"
engine = create_engine(conn_string)

In [293]:
query = "SELECT i.tableId, i.rowIndex, i.vec_pri, i.vec_sec FROM issues i where tableId NOT IN ('02db9f91-572a-44af-9858-4add101353c1','03bfc26a-c6d0-4761-b8f5-47acf2290d02','082134c0-6a4b-425b-a4ae-e79acd7316cb','0d10a967-88d6-42e5-9bd7-309f24022b5f','333e1e53-8897-41fa-acbd-86be8afb31c7','35bd2caf-562c-4d14-a5d6-373f168b4acb','397db969-9996-4d9e-bb05-6df69d0fe4a4','417546c4-dacf-4c12-ae75-4dc4e656e198','491c36c1-82d4-46ae-a684-470915a5659b','60b3993d-7075-4790-8519-ba8193579754','64a7ba33-ceee-4593-87a3-8f08dd46c8f4','67691780-af41-414b-a0c2-aa33a3442cdc','6a2f1370-1cd5-4ebb-a4bd-a1fe9d5a516a','6b437f67-967b-4ef6-bd28-5ac8d39138e4','77cc0b8d-8244-4622-8d9d-a56daf6069e8','8bb683d9-f7ee-4a54-ad3d-dddc61ccdfcf','9476acc2-294a-4cd6-a952-8274aedb645a','a6623233-9c9f-436b-ad11-0987ab3825e7','c04807de-2df1-4d26-9352-70d3cb6cb10b','cb197d7e-3ef6-4ee0-93d1-504c7286b580','f143c6b8-cf77-41c1-88b2-e7c97ba657c1','f2ebd484-4ec2-4481-907d-17334ca4657f','f4db9fc5-3a73-499a-ab1e-ab643530ea99','fdb3d057-943a-4fab-99ac-1f4eed471512','44a33e5f-d99e-48ef-ad56-bbb516ec8796','bfafbfd0-8bb5-4283-8f5e-dd7cbcec480c', '3e9e6cdb-f812-4832-b69c-b8ec0396d585');"

with engine.connect() as conn:
    df = pd.read_sql(query, conn)
    df1 = df.copy()
    df = df.applymap(lambda x: x.strip() if type(x)==str else x) # delete whitespaces
    df.vec_pri = df.vec_pri.replace('\s+', ' ', regex=True) # delete extra space between text strings
    df.vec_sec = df.vec_sec.replace('\s+', ' ', regex=True)
    df.vec_pri = df.vec_pri.str.lower()
    df.vec_sec = df.vec_sec.str.lower()
    df['vec_pri'].fillna('', inplace = True)
    df['vec_sec'].fillna('', inplace = True)
    df['vec_pri'] = df['vec_pri'].apply(remove_between_delimiters)
    df['vec_sec'] = df['vec_sec'].apply(remove_between_delimiters)
    df['vec_pri'] = df['vec_pri'].apply(remove_special_characters, remove_digits = True)
    df['vec_sec'] = df['vec_sec'].apply(remove_special_characters, remove_digits = True)
    df['vec_pri'] = df['vec_pri'].apply(lemmatize_text)
    df['vec_sec'] = df['vec_sec'].apply(lemmatize_text)
    df['vec_pri'] = df['vec_pri'].apply(prohibitedWords)
    df['vec_sec'] = df['vec_sec'].apply(prohibitedWords)
# #df.loc[df.vec_pri.str.contains("(?i)physical environment", na = False), 'physical'] = 1

In [294]:
vec_keyword_count = []
vec_keywords = []

for index, row in enumerate(df.itertuples()):
    issue_keyword_count = []
    
    for key, value in dict_final.items():
        counter = 0
        keyword = []
        for vec in value:
            if re.search(r'\b' + vec + r'\b', row.vec_pri):
                keyword.append(vec)
                counter += 1
        issue_keyword_count.append(counter)
        vec_keywords.append(keyword)
        
    if sum(issue_keyword_count) == 0:
        issue_keyword_count = []
        keyword = []
        for key, value in dict_final.items():
            idx = 0
            for vec in value:
                if re.search(r'\b' + vec + r'\b', row.vec_sec):
                    keyword.append(vec)
                    idx += 1
            issue_keyword_count.append(idx)
        vec_keywords.append(keyword)
            
    vec_keyword_count.append(issue_keyword_count)

In [295]:
# Create the pandas DataFrame  
df2 = pd.DataFrame(vec_keyword_count, columns = dict_final.keys()) 
df2['threshold'] = 0

In [296]:
s = np.where(df2.gt(df2['threshold'],0), ['soil, ', 'vegetation, ', 'water, ', 'fish, ', 'wildlife, ', 'species, ', 'air, ','heritage, ', 'physical, ', 'wetlands, ', 'acoustic, ', 'navigation, ', ''], '')
vecs = pd.Series([''.join(x).strip(', ') for x in s], name = "VECs")
df3 = vecs.to_frame()

In [297]:
df4 = pd.concat([df1, df2, df3], axis = 1)

In [298]:
df4 = df4.assign(VECs=df4.VECs.str.split(", ")).explode('VECs')
#df4.assign(Book=df.Book.str.split(",")).explode('Book')

In [299]:
query = "SELECT tableId, rowIndex, rowCounter, issue_parsed FROM issues_parsed;"
with engine.connect() as conn:
    issues_parsed_df = pd.read_sql(query, conn)
    issues_parsed_df_copy = issues_parsed_df.copy()
    issues_parsed_df = issues_parsed_df.applymap(lambda x: x.strip() if type(x)==str else x) # delete whitespaces
    issues_parsed_df.issue_parsed = issues_parsed_df.issue_parsed.replace('\s+', ' ', regex=True) # delete extra space between text strings
    issues_parsed_df.issue_parsed = issues_parsed_df.issue_parsed.str.lower()
    issues_parsed_df['issue_parsed'].fillna('', inplace = True)
    issues_parsed_df['issue_parsed'] = issues_parsed_df['issue_parsed'].apply(remove_between_delimiters)
    issues_parsed_df['issue_parsed'] = issues_parsed_df['issue_parsed'].apply(remove_special_characters, remove_digits = True)
    issues_parsed_df['issue_parsed'] = issues_parsed_df['issue_parsed'].apply(lemmatize_text)
    issues_parsed_df['issue_parsed'] = issues_parsed_df['issue_parsed'].apply(prohibitedWords)

In [300]:
vec_keyword_count_ip = []
for index, row in enumerate(issues_parsed_df.itertuples()):
    issue_keyword_count_ip = []
    for key, value in dict_final.items():
        counter = 0
        for vec in value:
            if re.search(r'\b' + vec + r'\b', row.issue_parsed):
                counter += 1
        issue_keyword_count_ip.append(counter)           
    vec_keyword_count_ip.append(issue_keyword_count_ip)

In [301]:
# Create the pandas DataFrame  
vec_count_ip_df = pd.DataFrame(vec_keyword_count_ip, columns = dict_final.keys()) 
vec_count_ip_df['threshold'] = 0

In [302]:
s_issue_parsed = np.where(vec_count_ip_df.gt(vec_count_ip_df['threshold'],0), ['soil, ', 'vegetation, ', 'water, ', 'fish, ', 'wildlife, ', 'species, ', 'air, ','heritage, ', 'physical, ', 'wetlands, ', 'acoustic, ', 'navigation, ', ''], '')
vecs_issue_parsed = pd.Series([''.join(x).strip(', ') for x in s_issue_parsed], name = "VECs")
df3_issue_parsed = vecs_issue_parsed.to_frame()

In [303]:
df4_issue_parsed = pd.concat([issues_parsed_df_copy, vec_count_ip_df, df3_issue_parsed], axis = 1)
df4_issue_parsed = df4_issue_parsed.assign(VECs=df4_issue_parsed.VECs.str.split(", ")).explode('VECs')

In [304]:
def read_data():
    final_df = df4.append(df4_issue_parsed, ignore_index=True, sort=False)
    return final_df.where(pd.notnull(final_df), None)

In [305]:
def populate_vecs_table():
    insert_vec_query = 'INSERT INTO word2vec (tableId, rowIndex, rowCounter, word2vec_vec) VALUES (%s, %s, %s, %s);'
    data = read_data()
    with engine.connect() as conn:
        for row in data.itertuples():
            conn.execute(insert_vec_query, (row.tableId, row.rowIndex, row.rowCounter, row.VECs))
    print("Done")

In [306]:
populate_vecs_table()

Done


In [307]:
query = "SELECT * FROM word2vec;"
with engine.connect() as conn:
    word2vec_df = pd.read_sql(query, conn)
word2vec_df.to_csv('vali1.csv')

In [None]:
'''def populate_word2vec_table():
    data = df3.to_dict('records')
    insert_query = 'INSERT INTO word2vec (word2vec_vec, tableId, rowIndex) VALUE (%s, %s, %s);'
    with engine.connect() as conn:
        for item in data:
            conn.execute(insert_query, (item['VECassigned'], item['tableId'], item['rowIndex']))
    print("Done") ## If it fails to insert all rows, it could be because of foreign key constraint error. Refer this link: https://stackoverflow.com/questions/2965837/insert-statement-conflicted-with-the-foreign-key-constraint-sql-server
populate_word2vec_table()'''

In [None]:
'''df = pd.read_csv('file.csv', encoding='cp1252')
df.head()

In [None]:
'''df = df.applymap(lambda x: x.strip() if type(x)==str else x) # delete whitespaces
df.filing_manual_text = df.filing_manual_text.replace('\s+', ' ', regex=True) # delete extra space between text strings
df.filing_manual_text = df.filing_manual_text.str.lower()
df['filing_manual_text'] = df['filing_manual_text'].apply(lemmatize_text)
df['filing_manual_text'] = df['filing_manual_text'].apply(prohibitedWords)'''

In [None]:
vec_keyword_count = []
vec_keywords = []

for index, row in enumerate(df.itertuples()):
    issue_keyword_count = []
    
    for key, value in dict_final.items():
        counter = 0
        keyword = []
        for vec in value:
            if re.search(r'\b' + vec + r'\b', row.filing_manual_text):
                keyword.append(vec)
                counter += 1
        issue_keyword_count.append(counter)
        vec_keywords.append(keyword)
            
    vec_keyword_count.append(issue_keyword_count)

vec_keyword_count

#### visualizing the Words

In [None]:
similar_words = {search_term: [item[0] for item in w2v_model.wv.most_similar([search_term], topn = 2)] for search_term in ['physical', 'soil', 'erosion', 'vegetation', 'water', 'fish', 'wetland', 'wildlife', 'specie', 'air']}

In [None]:
def tsne_plot(model):
    "Create TSNE model and plot it"
    words = sum([[k] + v for k, v in similar_words.items()], [])
    wvs = w2v_model.wv[words]
    tsne_model = TSNE(perplexity = 2, n_components = 2, n_iter = 10000, random_state = 0)
    np.set_printoptions(suppress = True)
    T = tsne_model.fit_transform(wvs)
    labels = words
    plt.figure(figsize=(14, 8))
    plt.scatter(T[:, 0], T[:, 1], c = 'orange', edgecolors = 'r')
    for label, x, y in zip(labels, T[:, 0], T[:, 1]):
        plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
    return plt.show()

In [None]:
tsne_plot(w2v_model)