# To Validate tweets

In [None]:
import pandas as pd
import numpy as np
import random
import copy
import glob
from pathlib import Path

# Seed the stdlib `random` module for reproducibility. The pandas sampling in
# addCol_takeSample is seeded separately through its own random_state value.
random.seed(33)

In [None]:
def addCol_takeSample(df, frac=0.01, min_size=0, random_state=33):
    '''
    Add the two annotation columns to a tweet DataFrame and draw a random
    sample of rows to be verified by hand.

    parameter: df: DataFrame of tweets. It is modified in place (the two
                   columns below are added, or reset if already present)
                   and is also returned.
               frac: fraction of the rows to sample. Default 0.01 (1%).
               min_size: lower bound on the sample size. When positive, the
                   sample holds at least ``min_size`` rows, or every row if
                   the DataFrame is smaller than that. The default 0 applies
                   no lower bound, so a small DataFrame can yield an empty
                   sample.
               random_state: seed forwarded to ``DataFrame.sample``.
    Returns: Two dataframes:
               - ``df`` with two additional columns, both initially NaN:
                     verified_tweet: if not NaN, the tweet belongs to the
                                     gold standard subset.
                     mentioned_shooting: if not NaN, the tweet corresponds
                                     to another shooting.
               - the random sample of ``df`` (same columns) to be verified.
    '''
    for column in ('verified_tweet', 'mentioned_shooting'):
        df[column] = np.nan
        # These columns later receive string labels; a float64 column would
        # make pandas warn (and eventually fail) on that assignment.
        df[column] = df[column].astype(object)

    if min_size <= 0:
        sample = df.sample(frac=frac, random_state=random_state)
    else:
        wanted = max(round(frac * len(df)), min_size)
        # Never ask for more rows than exist: sampling is without replacement.
        sample = df.sample(n=min(wanted, len(df)), random_state=random_state)

    return df, sample



def validate_tweets(df, df_sample, label):
    '''
    Interactively label every tweet of the sample.

    For each row of ``df_sample`` the counter, ``label``, the tweet date and
    the tweet text are printed, then one reply is read from standard input:

        'j' : the tweet is related to this shooting.
        'm' : the tweet is not related; a second line is read naming the
              shooting the tweet is about.
        'k' : the tweet is not related and mentions no other shooting.

    Any other reply leaves the tweet unlabelled and moves on to the next one.

    parameter: df: full DataFrame; rows are addressed by the index labels of
                   ``df_sample``.
               df_sample: the rows of ``df`` to verify; must provide the
                   ``date`` and ``tweet`` columns.
               label: name printed next to the counter for each tweet.
    Returns: Two dataframes: labelled copies of ``df`` and ``df_sample``.
             The inputs themselves are not modified.
    '''
    df_update = copy.deepcopy(df)
    df_sample_update = copy.deepcopy(df_sample)
    targets = (df_update, df_sample_update)

    # The label columns usually arrive as all-NaN float columns; make them
    # object dtype so that writing string labels is a valid assignment.
    for frame in targets:
        for column in ('verified_tweet', 'mentioned_shooting'):
            if column in frame.columns:
                frame[column] = frame[column].astype(object)

    def _record(idx, column, value):
        # `.at` is scalar access: the column key must be a plain label,
        # not a one-element list.
        for frame in targets:
            frame.at[idx, column] = value

    for j, (i, row) in enumerate(df_sample.iterrows(), start=1):
        print(j, label)
        print(row['date'])
        print(row['tweet'])
        reply = input()

        if reply == 'j':
            print(' The tweet IS related to this shooting.')
            _record(i, 'verified_tweet', 'related')
            print('-----')
        elif reply == 'm':
            print('The tweet IS NOT related to the shooting.')
            print('----------')

            about = input()
            print(f'The tweet IS about the shooting : {about}')
            _record(i, 'verified_tweet', 'not-related')
            _record(i, 'mentioned_shooting', f'{about}')
            print('----------')
        elif reply == 'k':
            print('The tweet IS NOT related to the shooting, no other shooting mentioned')
            _record(i, 'verified_tweet', 'not-related')
            print('----------')

    return df_update, df_sample_update

In [None]:
# Run the interactive validation for every ./Dict_*.csv file that has no
# '<name>_updated.csv' output yet, so an interrupted session can be resumed
# by simply re-running this cell.
for csv_path in glob.glob('./Dict_*.csv'):
    label = Path(csv_path).stem  # file name without directory and '.csv'
    df_updated_path = f'./Dict_updated_csv/{label}_updated.csv'
    df_sample_updated_path = f'./Dict_updated_csv/{label}_sample.csv'

    if Path(df_updated_path).is_file():
        continue

    try:
        csv = pd.read_csv(csv_path)
    except Exception as err:
        # Report the unreadable file and carry on with the others. A bare
        # `except:` would also swallow Ctrl-C and hide the actual error.
        print(f'Could not read {csv_path}: {err!r}')
        continue

    # Create the output folder before the manual session starts, so the
    # labelling work cannot be lost to a missing directory at save time.
    Path(df_updated_path).parent.mkdir(parents=True, exist_ok=True)

    df, df_sample = addCol_takeSample(csv)

    df_updated, df_sample_updated = validate_tweets(df, df_sample, label)

    df_updated.to_csv(df_updated_path, sep=',', encoding='utf-8', index=False)
    df_sample_updated.to_csv(df_sample_updated_path, sep=',', encoding='utf-8', index=False)


In [None]:
# Generic phrases that may appear in tweets about any shooting.
# NOTE(review): "guncontrol" is listed twice; because dict_check counts one
# hit per list entry, a duplicated entry is counted twice. Confirm whether
# that is intended.
general_phrases = [
    "shooting",
    "tragedy",
    "condolences",
    "thoughts",
    "prayers",
    "thoughts prayers",
    "murder",
    "killing",
    "shooter",
    "shooters",
    "armed gunman",
    "deepest condolences",
    "victims",
    "killed",
    "injured",
    "families",
    "heart",
    "shot",
    "tragic",
    "enoughisenough",
    "guncontrol",
    "gunviolence",
    "guncontrol",
    "mass murder",
    "grieve",
    "gun control",
    "heart broken",
    "2a",
]
# Second name bound to the very same list object.
bag_of_words = general_phrases

# Event-specific keywords, keyed by the label of each shooting.
# NOTE(review): "vegasstrong", "midland-odessa" and "odessastrong" each occur
# twice, and "1Ooctober" contains an uppercase letter O while every other
# keyword is lowercase -- possibly typos; confirm before changing.
dictionary = {
    "Plano": [
        "spencer hight",
        "meredith hight",
        "plano",
        "caleb edwards",
        "deffner",
        "rushin",
        "hight",
        "estranged husband",
        "north texas",
    ],
    "Pittsburgh": [
        "pittsburgh",
        "synagogue",
        "bowers",
        "tree life",
        "squirrel hill",
        "anti-semitism",
        "pittsburghsynagogue",
        "treeoflife",
        "treeoflifesynagogue",
        "showupforshabbat",
        "pittsburghshooting",
        "pittsburghstrong",
    ],
    "Las_Vegas": [
        "paddock",
        "vegas",
        "las vegas",
        "lombardo",
        "country music event",
        "music festival",
        "vegasstrong",
        "1Ooctober",
        "vegasstrong",
        "lasvegasstrong",
        "lasvegasshooting",
        "prayforvegas",
    ],
    "SanBernardino": [
        "san bernardino",
        "bernadino",
        "sbstrong",
        "prayforsanbernardino",
        "sanbernardino",
    ],
    "Houston": [
        "harris county",
        "gilliland",
        "valerie jackson",
        "dewayne jackson",
        "david ray conley",
        "conley",
    ],
    "Odessa": [
        "midland",
        "odessa",
        "midland-odessa",
        "west texas",
        "midland-odessa",
        "odessastrong",
        "odessashooting",
        "odessastrong",
        "westtexasstrong",
    ],
    "Bogue_Chitto": [
        "bogue chitto",
        "lincoln county",
        "brookhaven",
        "godbolt",
        "durr",
    ],
    "Washington": [
        "washington navy yard",
        "navy yard",
        "navyyard",
        "navyyardshooting",
    ],
    "Boulder": [
        "boulder",
        "grocery store",
        "arvada",
        "king soopers",
        "boulderstrong",
        "boulderproud",
    ],
    "Virginia": [
        "virginia beach",
        "nettleton",
        "princess anne",
        "municipal center",
        "virginiabeach",
        "cityofvabeach",
        "vbremembers",
        "vbstrong",
        "loveforvb",
    ],
}

In [None]:
def dict_check(text, keywords, general_keywords):
    '''
    Count how many dictionary entries occur in a tweet.

    parameter: text: the tweet text. A missing value (None or a float NaN,
                   which is what pandas yields for an empty CSV cell) counts
                   as containing no keyword.
               keywords: event-specific keywords.
               general_keywords: generic keywords.
    Returns: the number of entries of ``keywords`` plus the number of entries
             of ``general_keywords`` found in ``text``. Matching uses the
             case-sensitive ``in`` operator; each list entry counts at most
             once, but an entry listed twice is counted twice.
    '''
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return 0

    num_of_k_occur = sum(k in text for k in keywords)
    num_of_gk_occur = sum(k in text for k in general_keywords)
    return num_of_k_occur + num_of_gk_occur

def gen_csv_path(label, prefix='.'):
    '''
    Build the path of the unfiltered tweet CSV for a given label.

    parameter: label: name inserted into the file name.
               prefix: directory part of the path (default: current directory).
    Returns: the string '<prefix>/Twitter_<label>_unfiltered.csv'.
    '''
    template = "{}/Twitter_{}_unfiltered.csv"
    return template.format(prefix, label)

def gen_new_csv_path(label, prefix='./Filtered_tweets_csv/'):
    '''
    Build the path of the filtered tweet CSV for a given label.

    parameter: label: name inserted into the file name.
               prefix: directory part of the path. Trailing '/' characters
                   are dropped, so the default prefix does not produce a
                   doubled separator in the result.
    Returns: the string '<prefix>/Twitter_<label>_filtered.csv'.
    '''
    directory = str(prefix).rstrip('/')
    return f"{directory}/Twitter_{label}_filtered.csv"

In [None]:
# for label, keywords in dictionary.items():
#     csv_path = gen_csv_path(label)
    
#     csv = pd.read_csv(csv_path)
#     csv['dict_method_n_occurance'] = 0
    
#     for i, tweet in csv.iterrows():
#         csv.at[i, ['dict_method_n_occurance']] = dict_check(tweet.tweet, keywords, general_phrases)
    
#     new_csv = csv.loc[csv['dict_method_n_occurance'] > 0].reset_index()
    
#     print(f'Saving {label} csv file...')
#     new_csv_path = gen_new_csv_path(label)
#     new_csv.to_csv(new_csv_path)

In [None]:
# for csv_path in glob.glob('./Filtered_tweets_csv/Twitter_*.csv'):
#     df_updated_path = './Filtered_tweets_csv/validated_tweets/' + csv_path[22:-4] + '_updated.csv'
#     df_sample_updated_path = './Filtered_tweets_csv/validated_tweets/' + csv_path[22:-4] + '_sample.csv'
    
#     if Path(df_updated_path).is_file():
#         continue
        
#     csv = pd.read_csv(csv_path)

#     df, df_sample = addCol_takeSample(csv)
    
#     df_updated, df_sample_updated = validate_tweets(df, df_sample, csv_path[22:-4])

#     df_updated.to_csv(df_updated_path, sep=',', encoding='utf-8', index=False)
#     df_sample_updated.to_csv(df_sample_updated_path, sep=',', encoding='utf-8', index=False) 