In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
INPUT_DATA_FOLDER = '2_isw_parsed'
DATA_FILE = 'isw_all_days_parsed.csv'

OUTPUT_FOLDER = '3_isw_proprocessed'
OUTPUT_DATA_FILE = 'all_days_isw_parsed.csv'
OUTPUT_DATA_FILE2 = 'all_days_isw_parsed_v2.csv'

In [3]:
df = pd.read_csv(f'{INPUT_DATA_FOLDER}/{DATA_FILE}', sep = ';')

In [4]:
def remove_one_letter_word(data):
    words = word_tokenize(str(data))
    
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    
    return new_text

In [5]:
def convert_lower_case(data):
    return np.char.lower(data)

In [6]:
def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_stop_words = {"no","not"}
    stop_words = stop_words - stop_stop_words
    
    words = word_tokenize(str(data))
    
    new_text=''
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [7]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+—-./:;<=>?@[\]^_`{|}~\n"
    
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data,'  ', ' ')
    data = np.char.replace(data, ',', '')
    return data

In [8]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [9]:
def stemming(data):
    stemmer = PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

def lemmatizing(data):
    lemmatizer = WordNetLemmatizer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text

In [10]:
def conver_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        if w.isdigit():
            if int(w)<1000000000000:
                w = num2words(w)
            else:
                w=''
        new_text = new_text + ' ' + w
    new_text = np.char.replace(new_text,'-',' ')
    
    return new_text

In [11]:
def remove_url_string(data):
    words = word_tokenize(str(data))
    
    new_text = ''
    for w in words:
        w = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(w), flags = re.MULTILINE)
        w = re.sub(r'^http?:\/\/.*[\r\n]*', '', str(w), flags = re.MULTILINE)
        
        new_text = new_text + ' ' + w
        
    return new_text

In [12]:
def preprocess(data, word_root_algo= "lemm"):
    data = remove_one_letter_word(data)
    data = remove_url_string(data)
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = conver_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = conver_numbers(data)
    
    if word_root_algo == "lemm":
        data = lemmatizing(data)
    else:
        data = stemming(data)
        
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    
    return data

In [13]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [14]:
df['report_text_lemm']= df['main_html_v7'].apply(lambda x: preprocess(x,'lemm'))

In [15]:
df['report_text_stemm']= df['main_html_v7'].apply(lambda x: preprocess(x,'stemm'))

In [16]:
df.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v7,report_text_lemm,report_text_stemm
0,2022-02-24,russia_ukraine_warning_update_initial_russian_...,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"<div class=""field field-name-body field-type-t...",\nRussian President Vladimir Putin began a lar...,russian presid vladimir putin began larg scal...,russian presid vladimir putin began larg scal...
1,2022-02-25,russia_ukraine_warning_update_russian_offensiv...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...",\nRussian forces entered the outskirts of Kyiv...,russian forc enter outskirt kyiv west bank dn...,russian forc enter outskirt kyiv west bank dn...
2,2022-02-27,russia_ukraine_warning_update_russian_offensiv...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...",\nThe Russian military has likely recognized t...,russian militari like recogn initi expect lim...,russian militari like recogn initi expect lim...
3,2022-02-28,russian_offensive_campaign_assessment_february...,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nThe Russian military is reorganizing its mil...,russian militari reorgan militari effort atte...,russian militari reorgan militari effort atte...
4,2022-03-01,russian_offensive_campaign_assessment_march_1,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nRussian forces are completing the reinforcem...,russian forc complet reinforc resuppli troop ...,russian forc complet reinforc resuppli troop ...


In [17]:
# df.iloc[0,8]

In [18]:
df.to_csv(f'{OUTPUT_FOLDER}/{OUTPUT_DATA_FILE}', sep =';', index = False)

In [19]:
docs = df['report_text_lemm'].tolist()

In [20]:
cv = CountVectorizer(max_df=0.98, min_df=2)
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

(334, 6505)

In [21]:
with open('4_model/count_vectorizer_v1.pkl', 'wb') as handle:
    pickle.dump(cv ,handle)

In [22]:
tfidf_transformer =  TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [23]:
with open('4_model/tfidf_transformer_v1.pkl', 'wb') as handle:
    pickle.dump(tfidf_transformer, handle)

In [24]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names_out(), columns = ['idf_weights'])
df_idf.sort_values(by = ['idf_weights'])

Unnamed: 0,idf_weights
citi,1.021117
isw,1.021117
effort,1.021117
gener,1.024170
western,1.024170
...,...
moratorium,5.715518
moroccan,5.715518
motion,5.715518
mollifi,5.715518


In [25]:
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

In [26]:
tfidf = pickle.load(open('4_model/tfidf_transformer_v1.pkl', 'rb'))
cv = pickle.load(open('4_model/count_vectorizer_v1.pkl', 'rb'))

In [27]:
feature_names = cv.get_feature_names_out()

In [28]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key = lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn = 10):
    '''get the feature names and tf-idf score of top n items '''
    
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
        
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results

In [29]:
def conver_doc_to_vector(doc):
    feature_names = cv.get_feature_names_out()
    top_n = 100
    tf_idf_vector = tfidf.transform(cv.transform([doc]))
    
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    
    keywords = extract_topn_from_vector(feature_names, sorted_items,top_n)
    
    return keywords

In [30]:
df['keywords'] = df['report_text_lemm'].apply(lambda x: conver_doc_to_vector(x))

In [32]:
df

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v7,report_text_lemm,report_text_stemm,keywords
0,2022-02-24,russia_ukraine_warning_update_initial_russian_...,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"<div class=""field field-name-body field-type-t...",\nRussian President Vladimir Putin began a lar...,russian presid vladimir putin began larg scal...,russian presid vladimir putin began larg scal...,"{'pm': 0.381, 'airport': 0.263, 'kyiv': 0.244,..."
1,2022-02-25,russia_ukraine_warning_update_russian_offensiv...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...",\nRussian forces entered the outskirts of Kyiv...,russian forc enter outskirt kyiv west bank dn...,russian forc enter outskirt kyiv west bank dn...,"{'pm': 0.345, 'zero': 0.343, 'februari': 0.329..."
2,2022-02-27,russia_ukraine_warning_update_russian_offensiv...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...",\nThe Russian military has likely recognized t...,russian militari like recogn initi expect lim...,russian militari like recogn initi expect lim...,"{'februari': 0.505, 'seven': 0.345, 'twenti': ..."
3,2022-02-28,russian_offensive_campaign_assessment_february...,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nThe Russian military is reorganizing its mil...,russian militari reorgan militari effort atte...,russian militari reorgan militari effort atte...,"{'februari': 0.543, 'eight': 0.335, 'twenti': ..."
4,2022-03-01,russian_offensive_campaign_assessment_march_1,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nRussian forces are completing the reinforcem...,russian forc complet reinforc resuppli troop ...,russian forc complet reinforc resuppli troop ...,"{'kyiv': 0.352, 'envelop': 0.284, 'chernihiv':..."
...,...,...,...,...,...,...,...,...,...,...
329,2023-01-21,russian_offensive_campaign_assessment_january_...,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nClick here to see ISW’s interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'januari': 0.496, 'teplinski': 0.221, 'milblo..."
330,2023-01-22,russian_offensive_campaign_assessment_january_...,"Russian Offensive Campaign Assessment, Januar...","Russian Offensive Campaign Assessment, Januar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nClick here to see ISW’s interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'prigozhin': 0.598, 'putin': 0.327, 'wagner':..."
331,2023-01-23,russian_offensive_campaign_assessment_january_...,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nClick here to see ISW’s interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'januari': 0.664, 'twenti': 0.301, 'three': 0..."
332,2023-01-24,russian_offensive_campaign_assessment_january_...,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...",\nClick here to see ISW’s interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'januari': 0.525, 'twenti': 0.212, 'bakhmut':..."


In [34]:
df.to_csv(f'{OUTPUT_FOLDER}/{OUTPUT_DATA_FILE2}', sep =';', index = False)