In [33]:
import csv
import spacy
import re

# Load spaCy's small English language model (used for named-entity recognition below)
nlp = spacy.load('en_core_web_sm')

def ajusta_paises(frase):
    """Rewrite nationality adjectives and country aliases as country names.

    The replacements are applied one after another, in the order in which
    they are declared, as case-sensitive substring substitutions.

    Args:
        frase: Text to normalise.

    Returns:
        The text with every known alias replaced by its country name.
    """
    substituicoes = {
        'Australian': 'Australia',
        'Chinese': 'China',
        'China’s': 'China',
        'EU': 'Europe',
        'French': 'France',
        'German': 'Germany',
        'Iranian': 'Iran',
        'Iraqi': 'Iraq',
        'Israeli': 'Israel',
        'Japanese': 'Japan',
        'North Korean': 'North Korea',
        'North American': 'United States and Canada',
        'Russian': 'Russia',
        'U.S.': 'United States',
        'the United States': 'United States',
        'The United States': 'United States',
        # Was 'Ukrain' (typo): every other entry maps to a real country name.
        'Ukrainian': 'Ukraine',
        'Vietnamese': 'Vietnam',
    }
    for antigo, novo in substituicoes.items():
        if novo != antigo and novo.startswith(antigo):
            # The alias is a prefix of its own replacement ('German' -> 'Germany').
            # A plain replace would also hit text that already holds the country
            # name and produce 'Germanyy', so skip occurrences that are already
            # followed by the rest of the replacement.
            sufixo = novo[len(antigo):]
            padrao = re.escape(antigo) + '(?!' + re.escape(sufixo) + ')'
            frase = re.sub(padrao, lambda _m, alvo=novo: alvo, frase)
        else:
            frase = frase.replace(antigo, novo)
    return frase

def retira_paises(sentence):
    """Collect the countries / regions mentioned in a sentence.

    Combines spaCy entities labelled ``GPE`` with a fixed list of extra
    place terms that are looked up as plain substrings of the sentence.

    Args:
        sentence: Text to analyse.

    Returns:
        Alphabetically sorted list of distinct place names.
    """
    # These names are dropped from the NER output; they are only reported
    # through the substring lookup on the extra-terms list below.
    ignorados_ner = ('United States', 'China', 'Russia', 'North Korea', 'Iran')
    # Extra terms added to the result whenever they occur in the sentence
    termos_extra = (
        "Europe", "Persian Gulf", "United States", "China", "Russia", "Middle Eastern", 'world',
        'international waters', 'UAE', 'Middle East', 'North Korea', 'Latin America', 'Southeast Asia',
        'NATO', 'Central Asia', 'Eastern Europe', 'British', 'Iran',
    )

    encontrados = set()
    # Run NER and keep the geopolitical entities
    for entidade in nlp(sentence).ents:
        if entidade.label_ == 'GPE' and entidade.text not in ignorados_ner:
            encontrados.add(entidade.text)
    encontrados.update(termo for termo in termos_extra if termo in sentence)
    # The set removes duplicates; sorting gives a stable order
    return sorted(encontrados)

def retira_estatal(frase):
    """Find state (governmental) actors mentioned in a sentence.

    Keywords are matched as case-sensitive substrings of the
    whitespace-normalised sentence. A hit that is itself contained in a
    longer hit is dropped (e.g. 'agencies' when 'law enforcement agencies'
    also matched).

    Args:
        frase: Text to analyse.

    Returns:
        Alphabetically sorted list of the distinct matched keywords.
    """
    estatais = [
        # 1. Generic state actors
        'government_agencies', 'government', 'Senator', 'defense organizations', 'diplomats', 'ministries', 'Ministry',
        'President', 'military', 'grid systems', 'intelligence', 'election', 'Agency', 'agency', 'agencies', 'municipalities',
        'law enforcement agencies', 'defense', 'Foreign Ministry', 'Chancellor',
        # 2. US state actors
        # 2.1 Presidency
        'White House',
        # 2.2 Departments
        'Department of Health', 'Department of Defense', 'Department of Homeland Security', 'Department of Justice',
        'Treasury Department', 'grand jury', 'Office of Personnel Management', 'State Department', 'Department of Commerce',
        'OPM', 'DHS', 'Department of Labor', 'Justice Department',
        # 2.3 Agencies
        'CIA', 'NSA', 'DISA', 'FBI', 'Agency for International Development', 'NASA',
        # 2.4 Defence structures
        # (the comma after 'CENT COM' was missing, which fused it with 'Postal Service')
        'United States Cyber Command', 'Army', 'weapons systems', 'Air Force', 'DOD', 'CENT COM',
        # 2.5 Services
        'Postal Service', 'Census Bureau', 'Securities and Exchange Commission', 'Internal Revenue Service',
        # 3. State actors of other countries
        'Islamic Revolutionary Guardrooms', 'GRU', 'Navy', 'Unit 61398', 'PLA', 'Fizz ad-Din all-Assam',
        # 4. Terms that tie hackers to state institutions
        'state-sponsored hackers', 'linked hacking group', 'cyber espionage group', 'state hackers',
        'State-sponsored', 'linked hackers', 'military hackers',
    ]
    # Collapse runs of whitespace so multi-word keywords match across line breaks
    texto = ' '.join(frase.split())
    # A set guarantees each keyword is reported once even if it is listed twice
    estatais_encontradas = sorted({estatal for estatal in estatais if estatal in texto})
    # Keep only the most specific hits
    return [
        item1 for item1 in estatais_encontradas
        if not any(item1 in item2 for item2 in estatais_encontradas if item1 != item2)
    ]
    
def retira_NaoEstatal(frase):
    """Find non-state actors (hacker groups, NGOs, companies) in a sentence.

    Keywords are matched as case-sensitive substrings of the
    whitespace-normalised sentence; a hit contained in a longer hit is
    discarded.

    Args:
        frase: Text to analyse.

    Returns:
        Sorted list of the matched keywords.
    """
    naoEstatais = [
        # Hacker groups
        'Hackers', 'hackers', 'hacker', 'group of hackers', 'hacking group', 'Lazarus Group', 'Shadow Brokers', 'Cybercriminals', 'cyber criminals',
        'unknown hackers', 'cyber-intrusion campaign', 'Iran Cyber Army', 'insurgents',
        # NGOs
        'human rights activist', 'activist',

        # Companies, following the Global Industry Classification Standard (GICS)
        # Available at: https://www.msci.com/documents/1296102/11185224/GICS+Methodology+2020.pdf
        # 1. Energy
        'electric utility companies', 'electric utilities', 'energy companies', 'electric companies', 'energy', 'nuclear', 'energy facilities',
        'energy company', 'electrical utilities', 'electrical utility', 'power plants',
        # 2. Materials
        'petrochemical', 'manufacturing facilities', 'Alcoa', 'oil', 'gas', 'gas pipeline companie', 'gas pipelines',
        # 20. Industrials
        'industry', 'industries', 'company', 'companies',
        # 201010. Aerospace industry / aviation companies
        'aerospace industry', 'United Airlines', 'aviation', 'think_tanks', 'aerospace',
        # 201010. Defence industry
        'defense firms', 'defense industrial base', 'contractors', 'contractor',
        # 201030
        'construction', 'engineering', 'engineering companies',
        # 202010. Commercial services
        'law firm',
        # 2030. Transportation
        'Uber company',
        # 25. Consumer discretionary
        # 2510
        'automotive',
        # 253010. Hotels, restaurants
        'hotel',
        # 253020. Education services
        'think_tank', 'universities', 'University', 'scientific research', 'research organizations',
        # 30. Consumer staples
        'Target supermarket',
        # 35. Health care
        'medical', 'healthcare', 'pharmaceutical', 'health insurance', 'health', 'hospital', 'vaccine development', 'Community Health',
        # 40. Financials
        'bank', 'Bank', 'SWIFT', 'ATM', 'finance', 'Equifax company', 'companyWesat', 'company Wesat', 'United States Chamber of Commerce',
        # 45. Information technology
        'Microsoft', 'AOL', 'Yahoo company', 'computer hardware', 'Google',
        # 500. Communication services
        # 5010. Telecommunication services
        'telecommunications company', 'Verizon', 'telecom companies', 'telecommunication companies', 'telecommunications',
        # 5020. Media and entertainment
        'journalists', 'media', 'Sony Pictures Entertainment', 'Wikileaks', 'FIFA', 'United States Soccer Federation',
        # 10. Utilities
        'critical infrastructure', 'water', 'utility companies', 'infrastructure companies',
        # 11. Real estate

        # OTHER
        'citizens',
    ]
    texto = ' '.join(frase.split())

    candidatos = []
    for termo in naoEstatais:
        if termo in texto:
            candidatos.append(termo)
    candidatos = sorted(candidatos)

    # Drop every hit that is a proper part of another (longer) hit
    resultado = []
    for termo in candidatos:
        contido_em_outro = False
        for outro in candidatos:
            if termo != outro and termo in outro:
                contido_em_outro = True
                break
        if not contido_em_outro:
            resultado.append(termo)
    return resultado

def retira_ferramentasCiber(frase):
    """Find cyber-attack tools / techniques mentioned in a sentence.

    Keywords are matched as case-sensitive substrings of the
    whitespace-normalised sentence; a hit contained in a longer hit is
    discarded ('phishing' is dropped when 'spear phishing' matched).

    Args:
        frase: Text to analyse.

    Returns:
        Sorted list of the matched keywords.
    """
    ferramentas = (
        # APT
        'APT', 'complex', 'advanced', 'persistent', 'persistent cyberattacks',
        # Remote access
        'Remote Access', 'remote Access', 'remote control',
        # Backdoor
        'backdoor',
        # Botnet
        'Bot', 'robot', 'botnet',
        # Denial of service
        'DoS', 'DDoS', 'distributed', 'denial', 'traffic', 'denial-of-service', 'denial of service',
        # Spoofing
        'DNS', 'spoof', 'redirect', 'domain', 'direction',
        # Social engineering
        'Social engineering', 'fake',
        # Phishing
        'phishing', 'spear phishing', 'spear-phishing',
        # Ransomware
        'ransomware',
        # Zero-day
        'zero day', 'zero-day',
        # SQL injection
        'SQL', 'injection', 'SQL injection',
        # Wipers
        'wiper',
        # Trojan horse
        'trojan',
        # Spyware
        'spyware', 'spy', 'keylogger', 'screenlogger',
        # Viruses
        'worm', 'spread',
        # Web
        'URL', 'websites',
        # Investigation
        'Forensic',
    )
    texto = ' '.join(frase.split())
    achadas = sorted(filter(lambda ferramenta: ferramenta in texto, ferramentas))

    def _e_parte_de_outra(termo):
        # True when a different, longer hit already contains this one
        return any(termo != outra and termo in outra for outra in achadas)

    return [termo for termo in achadas if not _e_parte_de_outra(termo)]

def retira_efeito(frase):
    """Find the effects (physical, digital, economic, ...) named in a sentence.

    Keywords are matched as case-sensitive substrings of the
    whitespace-normalised sentence. Several keywords belong to more than
    one category and therefore appear more than once in the list; each
    matched keyword is nevertheless reported a single time. A hit that is
    contained in a longer hit is discarded.

    Args:
        frase: Text to analyse.

    Returns:
        Alphabetically sorted list of the distinct matched keywords.
    """
    efeitos = [
        # PHYSICAL
        # Degrade or reduce performance
        'reduced physical performance', 'phisicly performance',
        # Destroy
        'destroyed', 'burned',
        # Make unavailable
        'take down',
        # Interrupt
        'internet traffic', 'DoS', 'DDoS',
        # Access
        'flash memory', 'USB',
        # Accusation
        'indicted', 'indictment', 'charged', 'ascribed',
        # Bodily injury
        'die', 'bodly injure',

        # DIGITAL
        # Compromise (unauthorised access)
        'breach', 'breached', 'compromising', 'disruption', 'compromise', 'gain access', 'gain unauthorized access', 'disrupt',
        'gained access', 'penetrated', 'interfered', 'try to steal', 'attempt to steal', 'monitor', 'penetrate', 'intrusion',
        # Infect
        'exploiting', 'backdoor', 'malware', 'SQL', 'injection', 'SQL injection',
        # Leak
        'publish', 'data breach', 'leaking', 'exposing', 'release', 'publish',
        # Damage / make unavailable
        'botnet', 'DDoS', 'DoS', 'flood of traffic', 'network disrupted', 'disrupted their operations',
        'growing volume of cyber activity', 'lock', 'shut down',
        # Steal data
        'exfiltrate data', 'espionage', 'personal information', 'steal information', 'accessed data',
        'collect', 'eavesdrop', 'steal personal and financial data', 'exfiltrate', 'were taken', 'steal', 'stole',
        'obtained', 'download', 'gather',
        # Corrupt
        'ransomware', 'crippled', 'rebuilt', 'loss registry', 'skew',

        # ECONOMIC
        # Interrupt an economic process
        'Economic process interrupted',
        # Regulatory fines
        'fines', 'sanctions', 'sconce', 'amerce', 'mulct',
        # Financial or capital losses
        'stealing money', 'fraud', 'stolen money', 'dollars', 'intellectual property', 'commercial secrets',
        'intellectual property',
        # PR response cost
        'Public Relations', 'images costs',
        # Extortion payment
        'Payment', 'repayment',
        # Negative impact on GDP
        'GDP',

        # PSYCHOLOGICAL
        # Confusion / frustration
        'fake news', 'false information', 'disinformation', 'skew',
        # Negative changes in perception
        'perception change', 'sense modify',
        # Manipulation / influence
        'Trump campaign', 'Hilary campaign', 'political campaign', 'Democrats', 'Republicans',
        # Worry / anxiety
        'Intimidation', 'alert', 'warning',
        # Stimulation
        'deterrent', 'stimulation',
        # Trust in technology
        'tecnology believe', 'tecnology accreditation', 'science accreditation',

        # POLITICAL / REPUTATIONAL
        # Damage to public perception
        'government', 'municipality', 'municipalities', 'agency', 'agencies', 'ministery',
        # Damage to international relations
        # (the comma after 'false information about' was missing, which fused it with 'Wikileaks')
        'blame', 'accuse', 'false information about',
        # Media scrutiny
        'Wikileaks', 'media attention', 'New York Times', 'Washington Post',
        # Reduced cyber-resilience status
        'cyber resilience',
        # Political propaganda
        'intimidation campaign', 'election',
        # Attribution
        'indicted', 'indictment', 'charged', 'ascribed', 'accuse',
        # Deterrence
        'deterrence', 'dissuasion',

        # SOCIAL / SOCIETAL
        # Interruption of everyday activities
        'medical', 'utility companies', 'Uber',
        # Drop in international morale
        'false information',
        # Damage to services provided by critical infrastructure
        'critical infrastructure', 'restricted Internet access', 'infrastructure companies', 'electric utility companies',
        'electric utilities', 'energy companies', 'electric companies', 'energy', 'nuclear', 'energy facilities',
        'energy company', 'electrical utilities', 'electrical utility', 'power plants',
        # Change in public opinion
        'information operations', 'election', 'public opinion', 'uprising', 'rebellion', 'insurgency',
        # Trust in government
        'bilateral agreement', 'trust in government',
        # Protectionism
        'social proteccionism',
    ]
    # Collapse runs of whitespace so multi-word keywords match across line breaks
    texto = ' '.join(frase.split())
    # The set collapses keywords that are listed under several categories,
    # so a single occurrence in the text is not reported two or three times.
    efeito_encontrado = sorted({efeito for efeito in efeitos if efeito in texto})
    # Keep only the most specific hits
    return [
        item1 for item1 in efeito_encontrado
        if not any(item1 in item2 for item2 in efeito_encontrado if item1 != item2)
    ]

def retira_meios(frase):
    """Find the means / assets (devices, accounts, networks...) named in a sentence.

    Keywords are matched as case-sensitive substrings of the
    whitespace-normalised sentence; a hit contained in a longer hit is
    discarded ('computer' is dropped when 'computer system' matched).

    Args:
        frase: Text to analyse.

    Returns:
        Sorted list of the matched keywords.
    """
    meios = [
        # Accessories
        'Camera', 'decoder', 'USB', 'iot', 'flash memory',
        # Databases
        'Database', 'database', 'oracle',
        # Certificates
        'Certificate', 'certificate', 'factor', 'credential',
        # DNS
        'DNS', 'Direction', 'direction', 'redirection', 'domain',
        # E-mail
        'Mail', 'e-mail', 'email', 'gmail', 'account',
        # Social media
        'Social media', 'social media', 'Facebook', 'Twitter', 'Whatsapp', 'Telegram',
        # Networks
        'satellite', 'network', 'submarine cable',
        # Systems
        'computer system', 'servers', 'computer',
        # Websites
        'websites', 'URL', 'page', 'blog',
        # Software
        'software', 'application',
        # Telephony
        'mobile phone', 'phone', 'smartphone', 'tablet',
    ]
    texto = ' '.join(frase.split())
    achados = sorted(meio for meio in meios if meio in texto)

    mais_especificos = []
    for posicao, meio in enumerate(achados):
        # Compare against every other hit; equal strings never count as containers
        restantes = achados[:posicao] + achados[posicao + 1:]
        if all(meio == outro or meio not in outro for outro in restantes):
            mais_especificos.append(meio)
    return mais_especificos
    
#Lista de modificações
def ajusta_termos(frase):
    """Apply a fixed sequence of literal, case-sensitive substring rewrites.

    Each rule is a plain ``str.replace`` over the whole text, applied in
    the order written below. The order is significant: several rules
    match text produced by earlier rules (for example the long PLA name
    is first collapsed to 'PLA', which a later rule expands to
    'China PLA'), and the last rule rewrites every remaining 'US'.

    NOTE(review): the rules look hand-tuned to the wording of one specific
    corpus of incident descriptions -- confirm before reusing elsewhere.
    NOTE(review): the rules are not idempotent (e.g. 'CIA' -> 'U.S. CIA'),
    so calling this function twice on the same text stacks the prefixes.

    Args:
        frase: Text to rewrite.

    Returns:
        The rewritten text.
    """
    frase = frase.replace('USB drives','flash memory')
    frase = frase.replace('The Johns Hopkins University’s Applied Physics Laboratory, which does classified research for the Department of Defense and NASA, took its unclassified networks offline after they were penetrated.',
                         'The U.S. Johns Hopkins University offline networks was targeted')
    frase = frase.replace('think tank', 'think_tank')
    frase = frase.replace('U.S. election.', 'national election.')
    frase = frase.replace('U.S. elections.', 'national elections.')
    frase = frase.replace('state election websites', 'U.S. state election websites')
    frase = frase.replace('had been scanning', 'scans')
    frase = frase.replace('were found to have been working to gain', 'gained')
    frase = frase.replace('was found to be targeting', 'targeted')
    frase = frase.replace('affected hospitals to revert', 'affected U.S. hospitals to revert')
    frase = frase.replace('COVID-19 vaccine development', 'COVID-19 vaccine development from Canada, the UK, and the U.S.')
    frase = frase.replace('. S.', ' U.S.')
    frase = frase.replace('hackers linked to the Chinese government of attempting to steal', 'hackers from Chinese government steal')
    frase = frase.replace('UK-US', 'UK and United States')
    frase = frase.replace('CIA', 'U.S. CIA')
    frase = frase.replace('NSA', 'U.S. NSA')
    frase = frase.replace('White House','U.S. White House')
    frase = frase.replace('Las Vegas','U.S. las vegas')
    frase = frase.replace('US Army','U.S. Army')
    frase = frase.replace('Department of Defense','U.S. Department of Defense')
    frase = frase.replace('DOD','U.S. DOD')
    frase = frase.replace('US Departments of Justice and Treasury','U.S. Department of Justice and Treasury Department')
    frase = frase.replace('US and UK','U.S. and U.K.')
    frase = frase.replace('western','U.S. and allieds')
    frase = frase.replace('American hotel chain','U.S. American hotel chain')
    frase = frase.replace('Navy contractors','U.S. Navy contractors')
    # Two-step rewrite: the first rule introduces 'SEC’s', the second expands it
    frase = frase.replace('agency’s files in 2016','SEC’s agency files in 2016')
    frase = frase.replace('SEC’s','United States Securities and Exchange Commission’s')
    frase = frase.replace('Huawei','China telecommunications company Huawei')
    frase = frase.replace('north Korean Hacker','North Korean hacker')
    frase = frase.replace('GRU','Russia GRU')
    # Order matters: collapse the long name to 'PLA', then prefix every 'PLA'
    # with 'China'; the 'Unit 61398' rule then adds its own 'China PLA' prefix.
    frase = frase.replace('People’s LiberationArmy','PLA')
    frase = frase.replace('PLA', 'China PLA')
    frase = frase.replace('Unit 61398','China PLA Unit 61398')
    # 'CyberCommand' (no space) is used as an intermediate spelling here and in
    # the 'Cyber Command' rule below; a later rule turns it back into 'Cyber Command'.
    frase = frase.replace('US CyberCommand', 'U.S. CyberCommand')
    frase = frase.replace('OPM','U.S. OPM')
    frase = frase.replace('Sony Pictures Entertainment','U.S. Sony Pictures Entertainment')
    frase = frase.replace('IRS','U.S. Internal Revenue Service')
    frase = frase.replace('Office of Personnel Management','U.S. Office of Personnel Management')
    frase = frase.replace('US energy companies','U.S. energy companies')
    frase = frase.replace('Cyber Command','U.S. CyberCommand')
    frase = frase.replace('NASA','U.S. NASA')
    frase = frase.replace('Democratic candidates','U.S. Democratic candidates')
    frase = frase.replace('seeing a surge of attacks by Chinese hackers', 'Chinese hackers attacks')
    frase = frase.replace('of being involved', 'spies')
    frase = frase.replace('The U.S. Defense Information Systems Agency announced','According U.S. DISA,')
    frase = frase.replace('including the 2015 and 2016 attacks on Ukrainian critical infrastructure, the 2017NotPetya ransomware outbreak, election interference in the 2017 French elections, and others.','')
    frase = frase.replace('control of U.S. servers','control of U.S. NSA servers')
    frase = frase.replace('coronavirus vaccine','medical coronavirus vaccine')
    frase = frase.replace('government agencies','government_agencies')
    frase = frase.replace('impeachment debate.',' President impeachment debate.')
    frase = frase.replace('had used an Iranian hacking group’s tools and infrastructure','')
    frase = frase.replace('Trump','President Trump')
    frase = frase.replace('Irans living abroad.','Irans citizens')
    frase = frase.replace('were revealed to have conducted','conducted')
    frase = frase.replace('its intranet and internal','Huawei Chinese telecommunications company intranet and internal')
    frase = frase.replace('were revealed to have targeted','targeted')
    frase = frase.replace('cancer institutes','medical cancer institutes')
    frase = frase.replace('bank account numbers.','bank account numbers from U.S. bank Capital One.')
    frase = frase.replace('government networks were being','U.S. government networks were being')
    frase = frase.replace('announced it had launched','launched')
    frase = frase.replace('control missile and rocket launches','military control missile and rocket launches')
    frase = frase.replace('announced that it','')
    frase = frase.replace('it had exposed and helped dismantle an alleged','Iran had exposed and helped dismantle an alleged')
    frase = frase.replace('networks of electrical utilities','U.S. networks of electrical utilities')
    frase = frase.replace('affiliated with the','from')
    frase = frase.replace('Los Angeles County, California and Salt Lake County, Utah,','U.S.')
    frase = frase.replace('grid operators','electrical utility grid operators')
    frase = frase.replace('StateSecurity','State Security')
    # Restores the spaced spelling after the 'CyberCommand' rules above
    frase = frase.replace('CyberCommand','Cyber Command')
    frase = frase.replace('companyWesat','company Wesat')
    frase = frase.replace('revealed that it had been','was')
    frase = frase.replace('with the 2016 breach of','attacked')
    frase = frase.replace('hackers from the U.S., Russia, and Ukraine','U.S., Russia, and Ukraine hackers')
    frase = frase.replace('and New Zealand, accused',' and New Zealand was attacked by ')
    frase = frase.replace(' impersonating U.S. StateDepartment',' impersonating USStateDepartment')
    frase = frase.replace('military and law enforcement','U.S. military and law enforcement')
    frase = frase.replace('calls he made','calls U.S. President Trump made')
    frase = frase.replace('ACA sign-up season.','U.S. ACA sign-up season from medical Centers for Medicare and Medicaid Services.')
    frase = frase.replace('email accounts of Senators','email accounts of U.S. Senators')
    frase = frase.replace('Starwood hotel','U.S. Starwood hotel')
    frase = frase.replace('Russia and Iran. The campaigns targeted','Russia and Iran targeted')
    frase = frase.replace('were found to be engaged','engaged')
    frase = frase.replace('GB','Gigabytes')
    frase = frase.replace('Foreign Affairs ministries of US allies','Foreign Affairs ministries of U.S. allies')
    frase = frase.replace('The North Korean hacking group responsible for the SWIFT attacks was found','The North Korean hacking group attacked SWIFT system')
    frase = frase.replace('city of Atlanta','U.S. city of Atlanta')
    frase = frase.replace('Baltimore’s','U.S. Baltimore’s')
    frase = frase.replace('US critical infrastructure','U.S. critical infrastructure')
    frase = frase.replace('US states prior to the 2016election.','U.S. states prior to the 2016 election')
    frase = frase.replace('US grand jury','U.S. grand jury')
    frase = frase.replace('Alcoa','U.S. Alcoa company')
    frase = frase.replace('Target','U.S. Target supermarket')
    frase = frase.replace('Uber discloses that it paid hackers $100,000','Hackers ask to U.S. Uber company to pay $100,000')
    frase = frase.replace('Yahoo','U.S. Yahoo company was targeted')
    frase = frase.replace('were found to have targeted US','targeted U.S.')
    frase = frase.replace('Credit monitoring firm Equifax disclosed a July data breach that revealed 143million people’s full names, social security numbers, birthdates, home addresses and driver’license numbers, as well as 209,000 credit card numbers.','U.S. Equifax company was targeted')
    frase = frase.replace('Lazarus Group','North Korea Lazarus Group')
    frase = frase.replace('AdultFriendFinder','U.S. AdultFriendFinder company')
    frase = frase.replace('Democratic National Committee','U.S. Democratic National Committee')
    frase = frase.replace('usingWikiLeaks','using WikiLeaks')
    frase = frase.replace('Pastebin','pastebin')
    frase = frase.replace('Maryland and Washington','U.S. Maryland and Washington')
    frase = frase.replace('Social Security','U.S. Social Security')
    frase = frase.replace('The Internal Revenue Service (IRS) announced that a breach of its systems in','Russia did a fraud operation compromising 700k U.S. American taxpayers')
    frase = frase.replace('Israel’surveillance drones','Israel ’surveillance drones')
    frase = frase.replace('Obama administration officials','U.S. obama government')
    frase = frase.replace('A spear phishing attack on the Joint Chiefs of Staff unclassified email servers resulted in the system being shut down for 11 days while cyber experts rebuilt the network, affecting the work of roughly 4,000 military and civilian personnel. Officials believe that Russia is responsible for the intrusion, which occurred sometime around July 25, although China has not been ruled outs the perpetrator',
                         'Russia or maybe China compromised the U.S. Joint Chiefs of Staff unclassified email servers networks by a spear phishing')
    # The second rule still matches after the first one, because the prefixed
    # text 'U.S. United Airlines ...' contains the original phrase.
    frase = frase.replace('United Airlines','U.S. United Airlines')
    frase = frase.replace('United Airlines revealed that its computer systems', 'United Airlines company computer systems')
    frase = frase.replace('was revealed to have been','was')
    frase = frase.replace('StateDepartment’s','State Department ’s')
    frase = frase.replace('Community Health Systems disclosed that suspected Chinese hackers infiltrated its','Chinese hacker infiltrated in U.S. Community Health Systems')
    frase = frase.replace('The contractor responsible','Hackers compromise the contractor responsible')
    frase = frase.replace('The U.S. Navy says that Iran hacked into unclassified networks.','Iran hacked the U.S. Navy unclassified networks')
    frase = frase.replace('experience','was targeted by')
    frase = frase.replace('North Korea blames the United States and South Korea for','United States and South Korea targeted North Korea for')
    frase = frase.replace('Iran’s Fizz ad-Din all-Assam','Iran Fizz ad-Din all-Assam')
    frase = frase.replace('American infrastructure companies','U.S. American infrastructure companies')
    frase = frase.replace('cyber criminals targeted23','China cyber criminals targeted 23')
    frase = frase.replace('23 gas pipeline companies','23 U.S. gas pipeline companies')
    frase = frase.replace('F-35 Joint Strike Fighters','NATO and U.S. F-35 Joint Strike Fighters')
    frase = frase.replace('espionage operation. Unidentified hackers also targeted the President','espionage operation, including the president')
    frase = frase.replace('FIFA President','FIFA president')
    frase = frase.replace('Google reported a ','Chinese ')
    frase = frase.replace('Gmail','Google Gmail')
    frase = frase.replace('Twitter','U.S. Twitter')
    frase = frase.replace('Defense mentioned that a defense contractor','Defense mentioned that a U.S. defense contractor')
    frase = frase.replace('“Iranian Cyber Army”','Iran Cyber Army')
    frase = frase.replace('. Previously, the Iran Cyber Army', ' the Iran Cyber Army')
    frase = frase.replace('Google announced that a sophisticated attack had penetrated its networks, along with the networks of more than 30 other US companies',
                         'China targeted Google and other 30 U.S. companies, collecting techonology, gain access to activist Gmail accounts and Google Gaea password management system')
    frase = frase.replace('was convicted of stealing','stealed')
    frase = frase.replace('HUD Tenenbaum','Israeli hacker Ehud Tenenbaum')
    frase = frase.replace('was indicted on charges that between 2006 and 2008, he','')
    frase = frase.replace('South Korea accused North Korea','SK accused North Korea')
    # Matches text produced by the earlier 'Department of Defense', 'DOD'-style and 'NASA' prefix rules
    frase = frase.replace(', which does classified research for the U.S. U.S. Department of Defense and U.S. NASA,','')
    frase = frase.replace('the U.S. power grid to cyberattack also highlighted was the intrusions into F-35 databases by unknown foreign intruders.',
                         'The U.S. power grid and military F-35 database was targeted by foreign intruders')
    frase = frase.replace('Barack Obama','U.S. Presidential Barack Obama')
    frase = frase.replace('hackers successfully stole','China PLA hackers successfully stole')
    frase = frase.replace('that American, European, and Japanese companies were experiencing','U.S. , European, and Japanese companies were targeted')
    frase = frase.replace('Contractors employed by DHS and U.S. DOD had their networks hacked backdoors into agency systems.',
                         'U.S. DOD, DHS and contractors were hacked by backdoors')
    frase = frase.replace('Secretary of Defense’s','U.S. DOD Secretary of Defense’s')
    frase = frase.replace('a U.S. defense contractor was hacked and 24,000 files from the U.S. DOD were stolen.','hackers stolen a U.S. DOD defense contractor')
    frase = frase.replace('the US conducted cyber espionage against Chinese targets','the U.S. NSA conducted cyber espionage against Chinese targets')
    frase = frase.replace('The Justice Department indicted','The U.S. Justice Department indicted')
    frase = frase.replace('The FBI charged','The U.S. FBI charged')
    frase = frase.replace('ChancellorMerkel’s','Chancellor Merkel ’s')
    frase = frase.replace('The New York Times, Wall Street Journal, Washington Post, and Bloomberg News','The U.S. media news')
    # NOTE(review): plain substring replacement -- this also rewrites 'US' inside
    # longer tokens (e.g. 'USB' becomes 'U.S.B', and the 'USStateDepartment'
    # token created above becomes 'U.S.StateDepartment'). Confirm this is intended.
    frase = frase.replace('US','U.S.')
    return frase

def remover_exclusoes_anteriores(frase_original):
    """Strip a leading attribution clause from a sentence.

    Searches the sentence, case-insensitively, for the leftmost occurrence
    of any phrase in the exclusion list and returns only the text that
    follows that occurrence. The sentence is returned untouched when no
    phrase is found.

    Args:
        frase_original: Sentence to trim.

    Returns:
        The text after the matched phrase, or the original sentence.
    """
    # Phrases to look for
    exclusao = ["Security revealed that", "officials revealed that", "announced that", "announces that", "warned that",
                "Systems sustained", "CISA revealed that", "it believed", "finding authorized", "2019 operation by",
                "officials accused", "officials reported", "360 accused", "found that", "Huawei accused", "Capital One reveals",
                "received a report", "an alert warning that", "issued a warning that", "reported that",
                "access to the Internet Research Agency, ", 'Security researchers reveal that',
                "A spokesperson for China’s Foreign Ministry responded to accusations that ",
                "The UK  believed", 'The U.S. Securities and Exchange Commission charged a group of',
                'U.S. Navy officials report that ', 'Secretary of State Mike Pompeo confirmed that',
                'Security researchers report that', 'U.S. agencies warned President President Trump that that',
                'multiple Senate offices of attempts by', 'Researchers report that ', 'sponsored by groups in ',
                'reveal that a campaign by ', 'Microsoft reveals that ', 'would withdraw from the Iran nuclear agreement, security firms reported increases in',
                'joint warning that', 'alert to warn of ', 'Homeland Security confirmed that', 'reports warn of ',
                'Reports surface that ', 'Press reports say that the US', 'Cybersecurity researchers revealed a growing cyber-espionage campaign originating in ',
                'An intelligence report revealed a ', 'The U.S. Director of National Intelligence and Department of Homeland Security jointly identified',
                'Forensic evidence points to', 'Israel revealed an operation by the ', 'Dutch security firm Fox-IT identified',
                'Cybersecurity researchers uncovered a ', 'The Pentagon revealed that', 'U.S. officials report that ', 'CrowdStrike reported that ',
                'Federal prosecutors announce ', 'Press reports based on Snowden leaks reveal',
                'reveals documents showing among other things that', 'DHS reports that the ', 'DHS says that between December 2011 and June 2012, ',
                'U.S. NSA Director General Keith Alexander said that ', 'The U.S. Department of Homeland Security issued amber alerts warning of a ',
                'NASA’s Inspector General reported that ', 'Media reports say that ', 'According to a major U.S. news source, ',
                'In a speech unveiling the U.S. Department of Defense’s cyber strategy, the Deputy Secretary of Defense mentioned that',
                'twenty incidents in which ', 'British Foreign Minister William Hague reported attacks by ', 'Iranian political message',
                'Google announced that a', 'Wall Street Journal articles laid out the increasing vulnerability of ']
    # One alternation of all (escaped) phrases, delimited by word boundaries
    alternativas = '|'.join(re.escape(trecho) for trecho in exclusao)
    achado = re.search(r'\b(?:' + alternativas + r')\b', frase_original, flags=re.IGNORECASE)
    if achado is None:
        # No exclusion phrase present: nothing to cut
        return frase_original
    # Keep only what comes after the matched phrase
    return frase_original[achado.end():]

def remover_exclusoes_posteriores(frase_original):
    """Cut the sentence at the first occurrence of a known trailing phrase.

    The sentence is searched (case-insensitively) for any phrase in
    ``exclusao_pos``. If one is found, everything from the start of that
    phrase onward is dropped; otherwise the sentence is returned unchanged.

    Args:
        frase_original: Sentence (str) to be trimmed.

    Returns:
        The text that precedes the first matched phrase, or
        ``frase_original`` itself when no phrase matches.
    """
    # Phrases that mark the start of trailing text to be discarded
    exclusao_pos = ["in retaliation FORTRAN", "researching the North Korean nuclear","target companies in the media",
                   "by declaring that the United States was an “empire of hacking,” citing 2013 leaks about the NSA’s Prism program.",
                   "more than 100 organizations across government, IT, social media, academia, and more",
                   'from wire and bank fraud to obstruction of justice and conspiracy to steal trade secrets',
                   'in the weeks after the 2018 midterm elections','affecting Atlanta’s government earlier in 2018',
                   'to hack into U.S. aerospace companies and steal information','to deter them from interfering in the 2018 midterm elections',
                   'of hacking against organizations including FIFA','allegedly involved in the 2014','critical of Russia',
                   'against the Democratic Party in advance','to conduct cyberattacksagainst the','with ties to the South ',
                   'online efforts to interfere','that caused billions of','Security researchers found strong evidence',
                   'both involved with fugitive','in a campaign bearing resemblance','using Russian tools',
                   'The malicious payloads delivered through','over the September 2014 U.S. Yahoo','credentials another data',
                   'The company’CIO claimed the attack was perpetrated by a state-sponsored actor.','believed to be related to the arrest',
                   'The report, citing','The first resulted in the loss of','results in a $50 million loss, which the',
                   'Forensic data suggests the probes originated','Another attack at the Joint Propulsion on Laboratory',
                   'Hackers had access to everything in Commerce’s','Google attributes the effort to China',
                    'The goal of the penetrations','Tenenbaum was known for hacking','The cable said that at least some attacks originated from',
                   'as part of a larger series of attacks to access','to prevent interference in the upcoming','ahead of the national election',
                   'This may have been part of a larger campaign that included']
    # Build one alternative per phrase. A word boundary (\b) is only added on
    # an edge that is a word character: after a phrase ending in "." or " ",
    # \b would demand that a word character follows immediately, which made
    # phrases ending in "." impossible to match at the end of a sentence.
    alternativas = []
    for trecho in exclusao_pos:
        prefixo = r'\b' if re.match(r'\w', trecho[0]) else ''
        sufixo = r'\b' if re.match(r'\w', trecho[-1]) else ''
        alternativas.append(prefixo + re.escape(trecho) + sufixo)
    pattern = re.compile('|'.join(alternativas), flags=re.IGNORECASE)
    # Leftmost match in the sentence decides where to cut
    match = pattern.search(frase_original)
    if match:
        # Keep only what comes before the matched phrase
        return frase_original[:match.start()]
    return frase_original


def classifica_voz(sentence):
    """Classify a sentence as passive ("Passiva") or active ("Ativa").

    The decision is based on the first token that spaCy labels with the
    dependency "ROOT":

    * if its lower-cased text is one of ``palavras_passivas`` -> "Passiva";
    * if any of its direct children has dependency "auxpass" or
      "nsubjpass" -> "Passiva";
    * otherwise, or when the parse has no ROOT token -> "Ativa".

    Args:
        sentence: Text to be parsed with the module-level ``nlp`` pipeline.

    Returns:
        The string "Passiva" or "Ativa".
    """
    # ROOT verbs that are always classified as passive
    # (presumably because their grammatical subject is the victim - confirm)
    palavras_passivas = ['suffered','exposed','suffers']

    doc_spacy = nlp(sentence)
    # Only the first ROOT is considered, even if the text has several sentences
    root_token = next((token for token in doc_spacy if token.dep_ == "ROOT"), None)
    if root_token is None:
        return "Ativa"

    if root_token.text.lower() in palavras_passivas:
        return "Passiva"

    # The former check "root_token.dep_ in ['auxpass', 'nsubjpass']" was
    # removed: the token was selected because dep_ == "ROOT", so that test
    # could never be true. Passive markers show up on the ROOT's children.
    if any(child.dep_ in ("auxpass", "nsubjpass") for child in root_token.children):
        return "Passiva"
    return "Ativa"

def encontrar_root(sentence):
    """Return the text of the first ROOT token of the sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline. When no
    token has dependency "ROOT", the placeholder string "Não há ROOT." is
    returned instead.
    """
    for token in nlp(sentence):
        # Stop at the first token labelled as the dependency root
        if token.dep_ == "ROOT":
            return token.text
    return "Não há ROOT."

def classifica_acoesCiber(paises_atacantes,estatais_atacantes,naoEstatais_atacantes,
                         paises_atacados, estatais_atacadas,naoEstatais_atacados,efeito):
    """Label an incident with a cyber-action category.

    Decision order:

    1. ``efeito`` contains an indictment term -> "Não é uma ação cibernética".
    2. There is at least one state attacker:
       - no state target -> "Ataque Cibernético";
       - state target and ``efeito`` contains an espionage term
         -> "Inteligência Cibernética";
       - state target otherwise -> "Guerra Cibernética".
    3. There is at least one non-state attacker:
       - at least one non-state target -> "Crime Cibernético";
       - otherwise -> "Ataque Cibernético".
    4. None of the above -> "Indefinido".

    ``paises_atacantes`` and ``paises_atacados`` are accepted but not used.
    ``efeito`` is only tested with the ``in`` operator, so it may be a string
    (substring test) or a collection (membership test).

    Returns:
        One of the category strings listed above.
    """
    termos_indiciamento = ('indicted','indictment','charged','ascribed')
    termos_espionagem = ('exfiltrate data','espionage','personal information','steal information','accessed data',
                         'collect', 'eavesdrop','steal personal and financial data','exfiltrate','were taken', 'steal','stole',
                         'obtained')

    # Legal actions are not cyber actions, whoever the actors are
    if any(termo in efeito for termo in termos_indiciamento):
        return "Não é uma ação cibernética"

    if len(estatais_atacantes) > 0:
        if len(estatais_atacadas) == 0:
            return "Ataque Cibernético"
        # State against state: espionage wording separates intelligence from warfare
        if any(termo in efeito for termo in termos_espionagem):
            return "Inteligência Cibernética"
        return "Guerra Cibernética"

    if len(naoEstatais_atacantes) > 0:
        if len(naoEstatais_atacados) > 0:
            return "Crime Cibernético"
        return "Ataque Cibernético"

    return "Indefinido"

# Build baseSeparada.csv from bdEUA.csv: each kept input row is cleaned, split
# around its ROOT token into an "attacker" and an "attacked" clause, and the
# entities extracted from each clause are written as one output row.
with open('bdEUA.csv', 'r', encoding='utf-8') as entradaCSV, \
        open('baseSeparada.csv', 'w', newline='', encoding='utf-8') as saidaCSV:
    # Writer for the output file
    escritor = csv.writer(saidaCSV)
    # Header of the output file
    escritor.writerow(['ID','ID_Anterior','Ano','Mes','Frase Original','Frase Modificada','Tipo_voz','root',
                       'Oração Atacante','Ajuste atacante', 'Países_Atacantes','Estatais Atacantes','Não Estatais atacantes',
                       'Oração Atacado','Ajusta Atacado', 'Países_Atacados','Estatais atacadas','Não estatais atacadas',
                      'Ferramenta','Meios','Efeito','Ação Cibernética'])
    # Reader for the input file
    leitor_csv = csv.reader(entradaCSV)
    # Skip the input header so it is not processed as data
    # (the default avoids StopIteration on an empty file)
    next(leitor_csv, None)
    # Input IDs that mention the US but carry no effect; they are left out
    excluir = {14, 21, 37, 46, 83,89,95,99,105,106,113,120,127,128,129,139,150,152,174,195,199,237,240,241}
    # Sequential ID of the rows actually written
    i=1
    for linha in leitor_csv:
        # csv.reader yields an empty list for blank lines; nothing to process
        if not linha:
            continue
        if int(linha[0]) in excluir:
            continue

        idAnterior = linha[0]
        ano = linha[1]
        mes = linha[2]
        descricao = linha[3]
        sentence_original = linha[3]
        sentence_ajustada = ajusta_termos(sentence_original)
        # Voice, root and the clause split use the trimmed sentence ...
        sentence = remover_exclusoes_anteriores(sentence_ajustada)
        sentence = remover_exclusoes_posteriores(sentence)
        voz = classifica_voz(sentence)
        root = encontrar_root(sentence)
        # ... while tool, means and effect are extracted from the untrimmed one
        ferramenta = retira_ferramentasCiber(sentence_ajustada)
        meios = retira_meios(sentence_ajustada)
        efeito = retira_efeito(sentence_ajustada)
        doc_spacy = nlp(sentence)

        # Per-row defaults: without them a sentence with no ROOT token would
        # raise NameError on the first row or silently reuse the previous
        # row's clauses and entities.
        oracaoAtacante = oracaoAtacado = ""
        ajustaAtacante = ajustaAtacado = ""
        paises_atacantes, paises_atacados = [], []
        estatais_atacantes, estatais_atacadas = [], []
        naoEstatais_atacantes, naoEstatais_atacados = [], []

        # Tokens with dependency "ROOT"; only the first one is used
        root_tokens = [token for token in doc_spacy if token.dep_ == "ROOT"]
        if root_tokens:
            root_position = root_tokens[0].i
            # Split the sentence right after the ROOT token. In passive voice
            # the attacked party comes first; in active voice the attacker does.
            if voz == "Passiva":
                oracaoAtacante = doc_spacy[root_position + 1:]
                oracaoAtacado = doc_spacy[:root_position + 1]
            else:
                oracaoAtacante = doc_spacy[:root_position + 1]
                oracaoAtacado = doc_spacy[root_position + 1:]
            ajustaAtacante = ajusta_paises(oracaoAtacante.text)
            ajustaAtacado = ajusta_paises(oracaoAtacado.text)
            paises_atacantes = retira_paises(ajustaAtacante)
            paises_atacados = retira_paises(ajustaAtacado)
            estatais_atacantes = retira_estatal(ajustaAtacante)
            estatais_atacadas = retira_estatal(ajustaAtacado)
            naoEstatais_atacantes = retira_NaoEstatal(ajustaAtacante)
            naoEstatais_atacados = retira_NaoEstatal(ajustaAtacado)
        acoesCiber = classifica_acoesCiber(paises_atacantes,estatais_atacantes,naoEstatais_atacantes,
                                           paises_atacados, estatais_atacadas,naoEstatais_atacados,efeito)

        # Same column order as the header above
        insercao = [i,idAnterior,ano,mes,descricao,sentence_ajustada,voz,root,
                    oracaoAtacante,ajustaAtacante,paises_atacantes,estatais_atacantes,naoEstatais_atacantes,
                    oracaoAtacado,ajustaAtacado,paises_atacados,estatais_atacadas,naoEstatais_atacados,
                    ferramenta,meios,efeito,acoesCiber]
        escritor.writerow(insercao)
        i +=1

# Show the result
with open('baseSeparada.csv', 'r', encoding='utf-8') as mostra:
    leitor = csv.reader(mostra)
    for linhas in leitor:
        print(linhas)

            

['ID', 'ID_Anterior', 'Ano', 'Mes', 'Frase Original', 'Frase Modificada', 'Tipo_voz', 'root', 'Oração Atacante', 'Ajuste atacante', 'Países_Atacantes', 'Estatais Atacantes', 'Não Estatais atacantes', 'Oração Atacado', 'Ajusta Atacado', 'Países_Atacados', 'Estatais atacadas', 'Não estatais atacadas', 'Ferramenta', 'Meios', 'Efeito', 'Ação Cibernética']
['1', '1', '2020', 'December', 'North Korean hackers targeted U.S. pharmaceutical companies Johnson Johnson and Novavax, both working on experimental vaccines, in an attempt to obtain information on COVID-19.', 'North Korean hackers targeted U.S. pharmaceutical companies Johnson Johnson and Novavax, both working on experimental vaccines, in an attempt to obtain information on COVID-19.', 'Ativa', 'targeted', 'North Korean hackers targeted', 'North Korea hackers targeted', "['North Korea']", '[]', "['hackers']", 'U.S. pharmaceutical companies Johnson Johnson and Novavax, both working on experimental vaccines, in an attempt to obtain inform

In [30]:
# Print the attacker columns of every row of the generated file (header row included)
with open('baseSeparada.csv', 'r', encoding='utf-8') as mostra:
    leitor = csv.reader(mostra)
    for linhas in leitor:
        # Columns 10, 11 and 12 are read as attacker countries, state actors and non-state actors
        pais_atacante, estatal_atacante, naoEstatal_atacante = linhas[10], linhas[11], linhas[12]
        print(
            "Pais: ", pais_atacante,
            " Estatal Atacante:", estatal_atacante,
            "Não estatal Atacante: ", naoEstatal_atacante,
        )



Pais:  Países_Atacantes  Estatal Atacante: Estatais Atacantes Não estatal Atacante:  Não Estatais atacantes
Pais:  ['North Korea']  Estatal Atacante: [] Não estatal Atacante:  ['hackers']
Pais:  ['Russia']  Estatal Atacante: [] Não estatal Atacante:  ['hackers']
Pais:  []  Estatal Atacante: [] Não estatal Atacante:  ['hacking group']
Pais:  ['United States']  Estatal Atacante: ['NSA', 'United States Cyber Command'] Não estatal Atacante:  []
Pais:  ['Iran']  Estatal Atacante: [] Não estatal Atacante:  ['hackers']
Pais:  ['China']  Estatal Atacante: ['state-sponsored hackers'] Não estatal Atacante:  ['hackers']
Pais:  ['North Korea']  Estatal Atacante: [] Não estatal Atacante:  ['hacking group']
Pais:  ['Russia']  Estatal Atacante: [] Não estatal Atacante:  ['hacking group']
Pais:  ['Iran']  Estatal Atacante: [] Não estatal Atacante:  ['hacking group']
Pais:  ['China']  Estatal Atacante: ['government'] Não estatal Atacante:  ['hackers']
Pais:  ['United States']  Estatal Atacante: [] Não 