# Evaluate the optimal keyword-count threshold for the dictionary method and measure how well its results match the validated data

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px

## Dictionary method

In [None]:
# Directory that receives the CSVs annotated with the dictionary-method results.
# It is created here so that later cells can write into it.
dict_method_comparison_dir = '../dict_method_comparison'
Path(dict_method_comparison_dir).mkdir(parents=True, exist_ok=True)

# Event label -> keywords/phrases that the dictionary method looks for in an article.
# Every list must hold distinct entries: a repeated phrase would be credited as two
# separate keyword hits when occurrences are counted.
# NOTE(review): dict_check uses a case-sensitive substring test, so mixed-case entries
# ('SUV', 'anti-Semitism') only match text that preserves case -- confirm how the
# article text is normalised.
# NOTE(review): some entries are substrings of others (e.g. 'hight' / 'spencer hight',
# 'boulder' / 'boulder police'), so one phrase in the text can satisfy two keywords --
# confirm this is intended.
shooting_keywords = {'Plano':["spencer hight","dallas cowboys","meredith","dallas","plano","caleb edwards","deffner",
                              "rushin",'hight','husband killed wife','estranged wife','eight people',
                              'football-watching party'],
                     'Pittsburgh':["pittsburgh","synagogue","bowers","tree life","squirrel hill","jewish",
                                   "anti-semitism","jews",'11 people','anti-Semitism','11 dead','anti-semitic'],
                     'Vegas':["paddock","mandalay bay","route 91 harvest","las vegas","aldean","concert","mesquite",
                              "hotel","lombardo",'country music event','music festival','64-year-old man','58 people',
                              '500 injured','killed 58','59 people','injured 500','59 deaths','58 dead'],
                     'SanBernadino':['syed','rizwan','farook','tashfeen','malik','SUV',"inland regional center",
                                     "san bernardino","redlands","christmas party","public health department","bomb",
                                     '14 people','14 injured','14 lives','fourteen people','multiple shooters','gunmen',
                                     '14 dead','14 victims','developmental disabilities','disabled','17 injured',
                                     'san bernadino','public facility','sheriff deputy','muslim','islamic'],
                     'Houston':['conley','harris county','ex-girlfriend','houston','saturday','valerie jackson','window'],
                     'Odessa':['saturday','midland','odessa','seth','aaron','ator','west texas','traffic stop',
                               'white van','movie theater','random','5 people','21 injured','zack owens','rifle',
                               'midland-odessa','seven people','eight deaths'],
                     'Bogue':['lincoln county','brookhaven','killed eight people','2017 mississippi'],
                     'DC':['washington navy yard','alexis','monday','contractor','12 people','navy yard',
                           'military facility','armed military','12 victims','13 dead'],
                     'Boulder':['king soopers','boulder','ahmad al','aliwi','al-issa','alissa','arrested','9mm handgun',
                                'table mesa drive','eric talley','boulder police','in custody','ten people','10 people',
                                'grocery store','21-year-old','10 dead'],
                     'VirginiaBeach':['virginia beach','dewayne','craddock','employee','nettleton','.45-caliber',
                                      'engineer','municipal','11 people','12 people','cervera','police chief',
                                      '.45 handgun','12 dead']
                    }

# Smallest and largest keyword-list sizes across all events; the largest one bounds
# the range of thresholds evaluated further down.
min_keywords_length = min(len(words) for words in shooting_keywords.values())
max_keywords_length = max(len(words) for words in shooting_keywords.values())

In [None]:
def dict_check(text, keywords, min_num_of_occur):
    """Return whether ``text`` contains at least ``min_num_of_occur`` distinct keywords.

    A keyword counts as present when it is a substring of ``text``; the test is
    case-sensitive and is not restricted to word boundaries.

    Args:
        text: Text to search. A non-string value (for example the NaN that pandas
            produces for an empty CSV cell) is treated as containing no keywords.
        keywords: Iterable of keyword strings. Repeated entries are counted once,
            so a duplicated keyword cannot satisfy the threshold twice.
        min_num_of_occur: Minimum number of distinct keywords that must be present.

    Returns:
        bool: ``True`` when the number of distinct keywords found reaches the
        threshold, ``False`` otherwise.
    """
    if not isinstance(text, str):
        # Nothing can be found in a missing text: only a non-positive threshold is met.
        return bool(min_num_of_occur <= 0)

    num_of_occur = sum(keyword in text for keyword in set(keywords))
    return bool(num_of_occur >= min_num_of_occur)

def gen_csv_path(label, month, prefix='../NewsMedia'):
    """Return the path string ``<prefix>/v3_gs_<label>_<month>_month.csv``.

    Args:
        label: Event label inserted into the file name.
        month: Month identifier inserted into the file name.
        prefix: Directory part placed in front of the file name.
    """
    file_name = "_".join(("v3", "gs", str(label), str(month), "month.csv"))
    return "/".join((str(prefix), file_name))

def gen_new_csv_path(label, month, prefix='.'):
    """Return the path string ``<prefix>/v3_gs_<label>_<month>_month_w_dict_method.csv``.

    Args:
        label: Event label inserted into the file name.
        month: Month identifier inserted into the file name.
        prefix: Directory part placed in front of the file name.
    """
    file_name = "_".join(("v3", "gs", str(label), str(month), "month", "w", "dict", "method.csv"))
    return "/".join((str(prefix), file_name))

def series_similarity(series1, series2):
    """Return the fraction of positions at which two series hold equal values.

    Values are compared position by position (indexes are ignored). A missing
    value never equals anything, so it counts as a mismatch.

    Args:
        series1: First pandas Series.
        series2: Second pandas Series; must have the same length as ``series1``.

    Returns:
        float: Number of equal positions divided by the series length.

    Raises:
        ValueError: If the series differ in length. Without this check a
            length-1 series would be silently broadcast against the other one.
        ZeroDivisionError: If both series are empty.
    """
    if len(series1) != len(series2):
        raise ValueError(
            f"series must have the same length, got {len(series1)} and {len(series2)}"
        )

    equality = series1.to_numpy() == series2.to_numpy()
    # Count the matches in one vectorised call instead of a Python-level sum().
    return np.count_nonzero(equality) / len(equality)

In [None]:
# For every event label and month, load the article CSV, add one boolean column
# `dict_method_<k>` per threshold k (does the article text contain at least k of the
# label's keywords?), and save the annotated copy into the comparison directory.
for label, keywords in shooting_keywords.items():
    for month in ['first', 'second', 'third']:
        csv_path = gen_csv_path(label, month)

        # Skip label/month combinations that have no CSV on disk.
        if not Path(csv_path).is_file():
            continue

        df = pd.read_csv(csv_path)

        # Thresholds run up to the longest keyword list so that every output file has
        # the same set of columns, whatever the size of this label's own list.
        for num_of_occur in range(1, max_keywords_length + 1):
            tag = f'dict_method_{num_of_occur}'
            # Build the whole column in one pass over the `text` column rather than
            # writing cell by cell through iterrows()/df.at, which is far slower.
            df[tag] = [dict_check(text, keywords, num_of_occur) for text in df['text']]

        new_csv_path = gen_new_csv_path(label, month, dict_method_comparison_dir)
        df.to_csv(new_csv_path)

In [None]:
# Plot, for each annotated label/month CSV, the similarity between the dictionary-method
# column and the validated labels as a function of the keyword threshold, plus the
# sample-size-weighted mean across all files.
fig = go.Figure()
xs = list(range(1, max_keywords_length + 1))

# Read every annotated CSV exactly once; the frames are needed both for the largest
# sample size (used to shade the traces) and for the similarity curves.
annotated_frames = []
for label in shooting_keywords:
    for month in ['first', 'second', 'third']:
        csv_path = gen_new_csv_path(label, month, dict_method_comparison_dir)

        if not Path(csv_path).is_file():
            continue

        df = pd.read_csv(csv_path)

        # A file without rows has no similarity to report and would cause a
        # division by zero further down.
        if df.empty:
            continue

        annotated_frames.append((label, month, df))

max_sample_size = max((len(frame) for _, _, frame in annotated_frames), default=0)

# Per-threshold running totals: sum of (sample size * similarity) and sum of sample sizes.
weighted_means, counts = [0] * max_keywords_length, [0] * max_keywords_length

for label, month, df in annotated_frames:
    # Values other than 'related' / 'not-related' map to NaN and therefore never
    # match the boolean dictionary-method column.
    base = df['verified_article'].map({'related': True, 'not-related': False})
    sample_size = len(base)

    # Thresholds larger than this label's keyword count are not evaluated.
    num_thresholds = min(len(xs), len(shooting_keywords[label]))

    ys = []
    for i, num_of_occur in enumerate(xs[:num_thresholds]):
        tag = f'dict_method_{num_of_occur}'
        similarity = series_similarity(base, df[tag])

        ys.append(similarity)
        weighted_means[i] += sample_size * similarity
        counts[i] += sample_size

    fig.add_trace(go.Scatter(
        # Pair x with exactly the thresholds that were evaluated for this label.
        x=xs[:len(ys)], y=ys,
        # Only one per-file trace gets a legend entry.
        # NOTE(review): presumably to keep the legend compact -- confirm.
        showlegend=(label == 'Pittsburgh' and month == 'first'),
        name=f"{label}-{month}",
        # Grey level runs from 155 towards 0 (darker) as the sample size grows.
        mode='lines', line=dict(color="rgb({0}, {0}, {0})".format(155 - 155 * sample_size / max_sample_size), width=1),
        opacity=0.25,
    ))

fig.add_trace(go.Scatter(
    # Thresholds that no file contributed to are plotted as 0.
    x=xs, y=[m/c if c > 0 else 0 for m, c in zip(weighted_means, counts)],
    name='Mean (Weighted by the sample size)',
    mode='lines', line=dict(color='blue', width=2),
))

width, height = 600, 500

fig.update_layout(
    width=width, height=height,
    # The visible x-range is fixed to 1-15 even though xs extends to max_keywords_length.
    xaxis=dict(title='Number of Keywords', tickmode='array', tickvals=xs, mirror=True, range=(1, 15)),
    yaxis=dict(title='Similarity', mirror=True),
    template='simple_white',
    legend=dict(xanchor='center', x=0.5, yanchor='bottom', y=1.05, orientation='h')
)
fig.show()

# fig.write_image(dict_method_comparison_dir+'/keyword_threshhold.png', width=width, height=height, scale=3)