# Preprocessing GDELT Project

## Import libraries

In [1]:
!pip install validators
import validators

import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

import requests
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
import pandas as pd

import pandas as pd
import numpy as np
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import datetime
import time



## Scraping

### URL : masterfilelist.txt

In [2]:
def masterfilelist(start_date, end_date, max_lines=1000):
    """Download the GDELT v2 master file list and keep the entries in a time window.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive bounds of the window, formatted as "%Y-%m-%d %H:%M:%S".
    max_lines : int or None, default 1000
        Only the last ``max_lines`` lines of the master list are inspected.
        Pass ``None`` to inspect the whole list (needed when the window is
        older than the tail of the list).

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``date_str`` (first 12 characters of the file name),
        ``type_csv`` (token just before ".csv" in the lower-cased URL) and
        ``id`` (``date_str + '_' + type_csv``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If ``start_date`` or ``end_date`` does not match the expected format.
    """
    response = requests.get(
        "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt",
        timeout=60,
    )
    # Fail loudly instead of trying to parse an HTTP error page as a file list.
    response.raise_for_status()
    content = response.content.decode("utf-8")

    lines = content.split('\n')
    if max_lines is not None:
        lines = lines[-max_lines:]

    # Each line ends with the file URL; blank lines (e.g. the one produced by
    # the trailing newline) are skipped rather than blindly dropping the last row.
    urls = [line.strip().split(" ")[-1] for line in lines if line.strip()]

    df = pd.DataFrame(urls, columns=['url'])
    df['date_str'] = df['url'].apply(lambda x: x.split("/")[-1].split(".")[0][0:12])
    # Malformed entries become NaT and are excluded by the range filter below.
    df['date'] = pd.to_datetime(df['date_str'], format='%Y%m%d%H%M', errors='coerce')

    start_datem = datetime.datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
    end_datem = datetime.datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S")
    in_window = (df['date'] >= start_datem) & (df['date'] <= end_datem)
    # Copy so the column assignments below do not target a view of the full frame.
    df = df.loc[in_window].copy()

    df['type_csv'] = df['url'].apply(lambda x: x.lower().split(".csv")[0].split(".")[-1])
    df['id'] = df['date_str'] + '_' + df['type_csv']

    return df.drop(columns=['date'])

### URL : masterfilelist-translation.txt

In [3]:
def masterfilelist_translation(start_date, end_date, max_lines=1000):
    """Download the GDELT v2 translation master file list for a time window.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive bounds of the window, formatted as "%Y-%m-%d %H:%M:%S".
    max_lines : int or None, default 1000
        Only the last ``max_lines`` lines of the master list are inspected.
        Pass ``None`` to inspect the whole list.

    Returns
    -------
    pandas.DataFrame
        Columns ``url_translation``, ``date_str_translation`` (first 12
        characters of the file name), ``type_csv_translation`` (the two tokens
        before ".csv" joined with "_") and ``id`` (date string + '_' + the last
        token before ".csv").

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If ``start_date`` or ``end_date`` does not match the expected format.
    """
    response = requests.get(
        "http://data.gdeltproject.org/gdeltv2/masterfilelist-translation.txt",
        timeout=60,
    )
    # Fail loudly instead of trying to parse an HTTP error page as a file list.
    response.raise_for_status()
    content = response.content.decode("utf-8")

    lines = content.split('\n')
    if max_lines is not None:
        lines = lines[-max_lines:]

    # Each line ends with the file URL; blank lines (e.g. the one produced by
    # the trailing newline) are skipped rather than blindly dropping the last row.
    urls = [line.strip().split(" ")[-1] for line in lines if line.strip()]

    df = pd.DataFrame(urls, columns=['url_translation'])
    df['date_str_translation'] = df['url_translation'].apply(
        lambda x: x.split("/")[-1].split(".")[0][0:12]
    )
    # Malformed entries become NaT and are excluded by the range filter below.
    df['date'] = pd.to_datetime(
        df['date_str_translation'], format='%Y%m%d%H%M', errors='coerce'
    )

    start_datem = datetime.datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
    end_datem = datetime.datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S")
    in_window = (df['date'] >= start_datem) & (df['date'] <= end_datem)
    # Copy so the column assignments below do not target a view of the full frame.
    df = df.loc[in_window].copy()

    df['type_csv_translation'] = df['url_translation'].apply(
        lambda x: '_'.join(x.lower().split(".csv")[0].split(".")[-2:])
    )

    # The join key uses the untranslated type name so it lines up with the
    # 'id' column built from the non-translation master list.
    type_csv = df['url_translation'].apply(
        lambda x: x.lower().split(".csv")[0].split(".")[-1]
    )
    df['id'] = df['date_str_translation'] + '_' + type_csv

    return df.drop(columns=['date'])

### Vérification de l'url

In [4]:
def verify_url(u):
    """Return True when ``validators.url`` reports ``u`` as valid, else False."""
    return True if validators.url(u) == True else False

### Merge masterfilelist.txt and masterfilelist-translation.txt

In [5]:
def merge_table(df, df_translation):
    """Join the base and translation file lists and flag which URLs validate.

    Steps:
      1. left-join ``df`` with ``df_translation`` on ``id``;
      2. drop every row that contains a NaN;
      3. add boolean columns ``work`` / ``work_translation`` holding the
         result of ``verify_url`` for ``url`` / ``url_translation``.
    """
    joined = df.merge(df_translation, on='id', how='left')
    result = joined.dropna(axis='rows')

    result['work'] = result['url'].apply(verify_url)
    result['work_translation'] = result['url_translation'].apply(verify_url)
    return result

### Clean dataset

In [6]:
def clean_dataset(df):
    """Drop every timestamp whose set of files is incomplete or has a bad URL.

    A ``date_str`` group is removed entirely when any of these holds:
      * it has fewer than 3 rows with a non-null ``id``;
      * at least one of its rows has ``work == False``;
      * at least one of its rows has ``work_translation == False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date_str``, ``id``, ``work`` and
        ``work_translation``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` belonging to the surviving timestamps. The input
        frame itself is not modified.
    """
    # All filters operate on the `df` argument only; the function no longer
    # depends on a notebook-level `result` variable being defined.
    rows_per_date = df.groupby('date_str')['id'].count()
    incomplete_dates = rows_per_date[rows_per_date < 3].index
    df = df.loc[~df['date_str'].isin(incomplete_dates)]

    broken_dates = df.loc[df['work'].eq(False), 'date_str'].unique()
    df = df.loc[~df['date_str'].isin(broken_dates)]

    broken_translation_dates = df.loc[df['work_translation'].eq(False), 'date_str'].unique()
    df = df.loc[~df['date_str'].isin(broken_translation_dates)]

    return df

### Fusion des tables

In [7]:
def concat_table(result):
    """Stack base and translation URLs into one two-column table.

    The ``url`` / ``type_csv`` columns are taken as-is, the
    ``url_translation`` / ``type_csv_translation`` columns are renamed to the
    same two names, and the translation rows are appended below the base rows
    (original index labels are kept).
    """
    target_columns = ['url', 'type_csv']
    translation_columns = ['url_translation', 'type_csv_translation']

    base_part = result[target_columns]
    translation_part = result[translation_columns].rename(
        columns=dict(zip(translation_columns, target_columns))
    )

    return pd.concat([base_part, translation_part])

### Lecture des zips

In [8]:
def _read_gdelt_zip(url):
    """Download one zipped tab-separated file and return it as a DataFrame."""
    # The archive member has the same name as the file, minus the ".zip" suffix.
    member_name = url.split("/")[-1].split(".zip")[0]
    with urlopen(url) as response:
        payload = BytesIO(response.read())
    with ZipFile(payload) as archive, archive.open(member_name) as handle:
        return pd.read_csv(handle, header=None, on_bad_lines='skip', sep="\t",
                           engine='python', encoding='latin-1')


def _load_type(final, type_csv):
    """Read and stack every file of ``final`` whose ``type_csv`` matches."""
    urls = final.loc[final['type_csv'] == type_csv, 'url'].tolist()
    frames = [_read_gdelt_zip(url) for url in urls]
    # pd.concat raises on an empty list; return an empty frame instead so a
    # missing file type does not abort the whole load.
    return pd.concat(frames) if frames else pd.DataFrame()


def read_zip(final):
    """Download every listed archive and build one DataFrame per file type.

    Parameters
    ----------
    final : pandas.DataFrame
        Must contain ``url`` (location of a ``*.zip`` archive) and ``type_csv``
        (one of ``export``, ``mentions``, ``gkg``, ``translation_export``,
        ``translation_mentions``, ``translation_gkg``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(export, mentions, gkg, export_translation, mentions_translation,
        gkg_translation)``. Files of the same type are concatenated in listing
        order, each keeping its own row index; a type with no URL yields an
        empty DataFrame. Columns are unnamed (integer labels).
    """
    export = _load_type(final, 'export')
    mentions = _load_type(final, 'mentions')
    gkg = _load_type(final, 'gkg')

    export_translation = _load_type(final, 'translation_export')
    mentions_translation = _load_type(final, 'translation_mentions')
    gkg_translation = _load_type(final, 'translation_gkg')

    return export, mentions, gkg, export_translation, mentions_translation, gkg_translation

### Execution des fonctions

In [9]:
# Time window to fetch, formatted as "%Y-%m-%d %H:%M:%S".
# Defined once so both master lists are always queried for the same window.
START_DATE = '2022-01-01 23:00:00'
END_DATE = '2022-01-31 23:00:00'

# Every timing printed below is cumulative since this point.
start_time = time.time()

print("\n#### SCRAPPING... #####\n")

df = masterfilelist(START_DATE, END_DATE)

df_translation = masterfilelist_translation(START_DATE, END_DATE)

print("\n-------- %s seconde --------" % (time.time() - start_time))



print("\n\n#### MERGING... #####\n")

result = merge_table(df, df_translation)

print("\n-------- %s seconde --------" % (time.time() - start_time))


print("\n\n#### CLEANSING... #####\n")

result = clean_dataset(result)

print("\n-------- %s seconde --------" % (time.time() - start_time))



print("\n\n#### CONCATENATING... #####\n")

final = concat_table(result)

print("\n-------- %s seconde --------" % (time.time() - start_time))


print("\n\n#### READING & EXPORTING TO CSV... #####\n")

# Downloads every archive listed in `final`; this is by far the slowest step.
export, mentions, gkg, translation_export, translation_mentions, translation_gkg = read_zip(final)

print("-------- %s seconde --------" % (time.time() - start_time))


#### SCRAPPING... #####


-------- 55.826200008392334 seconde --------


#### MERGING... #####


-------- 55.83414912223816 seconde --------


#### CLEANSING... #####


-------- 55.837141036987305 seconde --------


#### CONCATENATING... #####


-------- 55.83913564682007 seconde --------


#### READING & EXPORTING TO CSV... #####

-------- 423.516056060791 seconde --------


### On visualise les dataframes

In [10]:
export

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,1025848292,20210131,202101,2021,2021.0849,GOV,PRESIDENT,,,,,,GOV,,,USA,UNITED STATES,USA,,,,,,,,1,20,20,2,1,3.0,4,1,4,-4.605263,2,"North Carolina, United States",US,USNC,,35.6411,-79.843100,NC,2,"North Carolina, United States",US,USNC,,35.6411,-79.843100,NC,2,"North Carolina, United States",US,USNC,,35.6411,-79.843100,NC,20220131103000,https://www.carolinajournal.com/opinion-articl...
1,1025848293,20210131,202101,2021,2021.0849,MED,SPOKESMAN,,,,,,MED,,,JUDJUD,DISTRICT COURT,,,,,,JUD,JUD,,0,14,14,1,1,0.0,2,1,2,0.305810,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,2,"South Carolina, United States",US,USSC,,33.8191,-80.906600,SC,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,20220131103000,https://www.usatoday.com/story/news/politics/2...
2,1025848294,20210131,202101,2021,2021.0849,USAMED,UNITED STATES,USA,,,,,MED,,,JUDJUD,DISTRICT COURT,,,,,,JUD,JUD,,0,14,14,1,1,0.0,2,1,2,0.305810,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,20220131103000,https://www.usatoday.com/story/news/politics/2...
3,1025848295,20210131,202101,2021,2021.0849,USAMED,UNITED STATES,USA,,,,,MED,,,JUDJUD,DISTRICT COURT,,,,,,JUD,JUD,,0,14,14,1,1,0.0,2,1,2,0.305810,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,2,"South Carolina, United States",US,USSC,,33.8191,-80.906600,SC,3,"White House, District of Columbia, United States",US,USDC,,38.8951,-77.036400,531871,20220131103000,https://www.usatoday.com/story/news/politics/2...
4,1025848296,20210131,202101,2021,2021.0849,USAMED,UNITED STATES,USA,,,,,MED,,,JUDJUD,DISTRICT COURT,,,,,,JUD,JUD,,0,14,14,1,1,0.0,1,1,1,0.305810,2,"Texas, United States",US,USTX,,31.1060,-97.647500,TX,2,"Texas, United States",US,USTX,,31.1060,-97.647500,TX,2,"Texas, United States",US,USTX,,31.1060,-97.647500,TX,20220131103000,https://www.usatoday.com/story/news/politics/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,1025953611,20220131,202201,2022,2022.0849,YEMGOV,YEMENI,YEM,,,,,GOV,,,JOR,JORDANIAN,JOR,,,,,,,,1,51,51,5,1,3.4,12,1,12,3.781513,1,Yemen,YM,YM,,15.5000,47.500000,YM,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,1,Yemen,YM,YM,,15.5000,47.500000,YM,20220131230000,http://www.jordantimes.com/news/local/jordan-y...
998,1025953612,20220131,202201,2022,2022.0849,ran,RANA,,,ran,,,,,,JOR,JORDAN,JOR,,,,,,,,0,10,10,1,1,0.0,3,1,3,6.318083,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,20220131230000,http://jordantimes.com/news/local/kings-vision...
999,1025953613,20220131,202201,2022,2022.0849,znd,AZERI,,,znd,,,,,,BUS,COMPANIES,,,,,,BUS,,,1,193,193,19,4,-10.0,6,1,6,-4.592423,4,"Azeri, Ida-Virumaa, Estonia",EN,EN03,15925,59.4506,26.867500,-2621534,4,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,4,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,20220131230000,https://www.msn.com/en-xl/news/other/court-app...
1000,1025953614,20220131,202201,2022,2022.0849,znd,AZERI,,,znd,,,,,,EST,ESTONIA,EST,,,,,,,,1,192,192,19,4,-9.5,4,1,4,-4.592423,4,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,20220131230000,https://www.msn.com/en-xl/news/other/court-app...


In [11]:
mentions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1025840133,20220131091500,20220131103000,1,saharasamay.com,http://www.saharasamay.com/world-news/67663948...,14,2093,2119,2158,0,10,2924,-4.065041,,
1,1025840134,20220131091500,20220131103000,1,saharasamay.com,http://www.saharasamay.com/world-news/67663948...,14,2093,2119,2158,0,10,2924,-4.065041,,
2,967145097,20210131001500,20220131103000,1,mirror.co.uk,https://www.mirror.co.uk/3am/celebrity-news/sp...,8,1861,-1,1876,0,40,2936,2.803738,,
3,967223391,20210131170000,20220131103000,1,carolinajournal.com,https://www.carolinajournal.com/opinion-articl...,3,1734,-1,1771,1,10,4575,-4.605263,,
4,967177421,20210131080000,20220131103000,1,phys.org,https://phys.org/news/2022-01-explores-tempera...,9,3569,3651,3619,1,100,6811,1.797040,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3802,1025939176,20220131210000,20220131230000,1,whbl.com,https://whbl.com/2022/01/31/south-africa-scrap...,1,75,106,123,1,60,560,-6.382979,,
3803,1025953612,20220131230000,20220131230000,1,jordantimes.com,http://jordantimes.com/news/local/kings-vision...,10,1831,1877,1868,1,30,2907,6.318083,,
3804,1025953613,20220131230000,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1232,1226,1,60,5413,-4.592423,,
3805,1025953614,20220131230000,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1206,1150,0,40,5413,-4.592423,,


In [12]:
gkg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,20220131103000-0,20220131103000,1,cbc.ca,https://www.cbc.ca/news/canada/edmonton/edmont...,,,REL_ANTISEMITISM;MEDIA_MSM;PROTEST;URBAN;TAX_F...,"GENERAL_GOVERNMENT,1591;EPU_POLICY_GOVERNMENT,...","1#Canada#CA#CA#60#-96#CA;4#Ottawa, Ontario, Ca...","4#Calgary, Alberta, Canada#CA#CA01#12549#51.08...",amarjeet sohi;michael cooper;cathy heron;duane...,"Amarjeet Sohi,1884;Michael Cooper,208;Michael ...",mount royal university in calgary,"Mount Royal University In Calgary,2478","-4.06189555125725,1.1605415860735,5.2224371373...",,"wc:480,c12.1:28,c12.10:42,c12.12:20,c12.13:15,...",https://i.cbc.ca/1.4113389.1643590906!/fileIma...,,https://pic.twitter.com/OAWwz1rko9;,https://youtube.com/user/CBCtv;,1465|36||small number of unsavoury characters,"Amarjeet Sohi,1925;Cathy Heron,1958;Mount Roya...",,,<PAGE_LINKS>https://twitter.com/Cooper4SAE/sta...
1,20220131103000-1,20220131103000,1,marketwatch.com,https://www.marketwatch.com/story/sensyne-heal...,,,GENERAL_HEALTH;MEDICAL;USPEC_POLICY1;EPU_UNCER...,"GENERAL_HEALTH,31;MEDICAL,31;USPEC_POLICY1,425...",,,kyle morris,"Kyle Morris,14;Kyle Morris,756",sensyne health,"Sensyne Health,31","-1.38888888888889,2.77777777777778,4.166666666...",,"wc:108,c1.2:1,c12.1:4,c12.10:9,c12.12:5,c12.13...",,,,,,"Kyle Morris,850","1000000,pounds,240;8000000,a year earlier,284;...",,<PAGE_PRECISEPUBTIMESTAMP>20220131094400</PAGE...
2,20220131103000-2,20220131103000,1,familylawweek.co.uk,https://www.familylawweek.co.uk/site.aspx?i=ed...,,,USPEC_POLITICS_GENERAL1;WB_696_PUBLIC_SECTOR_M...,"WB_845_LEGAL_AND_REGULATORY_FRAMEWORK,1217;WB_...",,,,,ministry of justice;party parliamentary group ...,"Ministry Of Justice,1771;Ministry Of Justice,2...","2.7072758037225,4.73773265651438,2.03045685279...",1#0#0#2012#1188;1#0#0#2012#2223,"wc:530,c1.3:1,c12.1:38,c12.10:69,c12.12:9,c12....",,,,,,"All Party Parliamentary Group,125;Kinship Care...","3,quarters of kinship carers,495;3,kinship car...",,<PAGE_LINKS>https://frg.org.uk/policy-and-camp...
3,20220131103000-3,20220131103000,1,somersetcountygazette.co.uk,https://www.somersetcountygazette.co.uk/news/1...,,,KILL;SOC_GENERALCRIME;TRIAL;WB_2433_CONFLICT_A...,"WB_566_ENVIRONMENT_AND_NATURAL_RESOURCES,2063;...","4#Pont-De-Beauvoisin, RhÃ´Alpes, France#FR#FRB...",1#France#FR#FR##46#2#FR#86;1#France#FR#FR##46#...,arthur noyer,"Arthur Noyer,2351",associated press,"Associated Press,695","-6.81114551083591,0.619195046439629,7.43034055...",4#2#18#0#3773,"wc:584,c12.1:26,c12.10:38,c12.12:24,c12.13:11,...",https://www.somersetcountygazette.co.uk/resour...,,,,3819|33||a drop of blood found in the boot,"Maelys De Araujo,32;Joachim De Araujo,1058;Art...","6,months later,54;2,guests who had asked,524;4...",,<PAGE_PRECISEPUBTIMESTAMP>20220131100600</PAGE...
4,20220131103000-4,20220131103000,1,citizen-times.com,https://www.citizen-times.com/story/news/local...,,,TAX_ECON_PRICE;TAX_FNCACT;TAX_FNCACT_CITIZEN;G...,"ECON_HOUSING_PRICES,167;WB_904_HOUSING_MARKETS...","3#Miami, Florida, United States#US#USFL#25.774...","3#Madison County, North Carolina, United State...",patrick bowen;john boyle asheville,"Patrick Bowen,5466;John Boyle Asheville,97",dogwood health trust in asheville;national pub...,"Dogwood Health Trust In Asheville,5641;Nationa...","-0.235294117647059,1.17647058823529,1.41176470...",,"wc:1153,c1.2:7,c12.1:59,c12.10:133,c12.11:3,c1...",https://www.gannett-cdn.com/presto/2021/09/24/...,,,,6986|69||the number of units that could potent...,"Asheville Citizen,113;North Carolina,296;Apart...","1,dollars ,452;10,largest North Carolina citie...",,"<PAGE_TITLE>Asheville, NC rents on the rise, m..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630,20220131230000-1630,20220131230000,1,palmbeachpost.com,https://www.palmbeachpost.com/story/news/2022/...,,,UNGP_FORESTS_RIVERS_OCEANS;EPU_CATS_MIGRATION_...,"TAX_ETHNICITY_INDIAN,5803;NATURAL_DISASTER_FRE...","3#Miami, Florida, United States#US#USFL#25.774...","3#Alachua, Florida, United States#US#USFL#FL00...",louise pearson;kathy hillard dimpflmaier;lisa ...,"Louise Pearson,2671;Kathy Hillard Dimpflmaier,...",instagram;twitter;wildlife conservation commis...,"Instagram,68;Instagram,2449;Instagram,6150;Ins...","-1.04712041884817,1.42109199700823,2.468212415...",,"wc:1203,c1.1:2,c1.3:1,c12.1:94,c12.10:139,c12....",https://www.gannett-cdn.com/presto/2022/01/31/...,,https://pic.twitter.com/uE4zDOGHuC;https://ins...,,3596|132||This is a frozen iguana. In Florida ...,"Palm Beach,247;Cold Florida,275;Palm Beach,519...","32,degrees,420;20,inches including tail,2340;3...",,<PAGE_LINKS>https://twitter.com/Drew_Morris/st...
1631,20220131230000-1631,20220131230000,1,cosmopolitan.com,https://www.cosmopolitan.com/style-beauty/beau...,,,TAX_DISEASE;TAX_DISEASE_CANCER;WB_1406_DISEASE...,"TAX_DISEASE_CANCER,165;WB_1406_DISEASES,165;WB...",,,,,,,"-1.43884892086331,2.87769784172662,4.316546762...",,"wc:116,c12.1:15,c12.10:20,c12.12:9,c12.13:5,c1...",https://hips.hearstapps.com/hmg-prod.s3.amazon...,,,https://youtube.com/c/cosmopolitan?sub_confirm...,,,"21,best face,519;",,<PAGE_LINKS>https://www.cosmopolitan.com/style...
1632,20220131230000-1632,20220131230000,1,yahoo.com,https://news.yahoo.com/utahs-ingles-mri-shows-...,,,EPU_CATS_REGULATION;WB_1921_PRIVATE_SECTOR_DEV...,"MANMADE_DISASTER_IMPLIED,2591;MANMADE_DISASTER...",,,tobias koppers;yehuda katz;stefan penner;tom d...,"Tobias Koppers,29561;Yehuda Katz,904;Stefan Pe...",yahoo,"Yahoo,6724;Yahoo,25516;Yahoo,25539;Yahoo,25709...","-1.14793155728828,0.389863547758285,1.53779510...",1#0#0#2014#907,"wc:4479,c1.1:3,c1.2:16,c12.1:112,c12.10:167,c1...",https://s.yimg.com/ny/api/res/1.2/aPaeELnptuV_...,,,,1711|202||===Object prototype toString call ( ...,"Tom Dale,1078;Stefan Penner,1097;Jake Archibal...","90,fireImageBeacon,8449;",,<PAGE_ALTURL_AMP>https://news.yahoo.com/amphtm...
1633,20220131230000-1633,20220131230000,1,iheart.com,https://my100fm.iheart.com/content/2022-01-31-...,,,NATURAL_DISASTER;NATURAL_DISASTER_HEAVY_SNOW;N...,WB_1458_HEALTH_PROMOTION_AND_DISEASE_PREVENTIO...,,,,,mansfield corporation,"Mansfield Corporation,87","-2.96296296296296,2.96296296296296,5.925925925...",,"wc:114,c12.1:7,c12.10:10,c12.12:2,c12.13:3,c12...",https://i.iheart.com/v3/re/assets.getty/60622e...,,https://instagram.com/p/CZPOVGLrQKj;https://pi...,https://youtube.com/watch?v=ZU6kXopIfqQ;https:...,,"Mansfield Residents,240;Mansfield Codified Ord...",,,<PAGE_PRECISEPUBTIMESTAMP>20220131182800</PAGE...


In [13]:
translation_export

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,1025850300,20210131,202101,2021,2021.0849,RUS,RUSSIA,RUS,,,,,,,,USA,UNITED STATES,USA,,,,,,,,0,113,113,11,3,-2.0,8,1,8,-2.403846,4,"Moscow, Moskva, Russia",RS,RS48,25106,55.752200,37.6156,-2960561,1,United States,US,US,,39.828175,-98.5795,US,4,"Moscow, Moskva, Russia",RS,RS48,25106,55.752200,37.6156,-2960561,20220131103000,https://www.wnp.pl/wiadomosci/fogiel-niebezpie...
1,1025850301,20220124,202201,2022,2022.0658,,,,,,,,,,,ESP,SARAGOSSA,ESP,,,,,,,,1,36,36,3,1,4.0,10,1,10,2.029664,0,,,,,,,,1,United States,US,US,,39.828175,-98.5795,US,1,United States,US,US,,39.828175,-98.5795,US,20220131103000,https://www.europapress.es/aragon/noticia-lleg...
2,1025850302,20220124,202201,2022,2022.0658,ESP,SARAGOSSA,ESP,,,,,,,,,,,,,,,,,,1,36,36,3,1,4.0,10,1,10,2.029664,1,United States,US,US,,39.828175,-98.5795,US,0,,,,,,,,1,United States,US,US,,39.828175,-98.5795,US,20220131103000,https://www.europapress.es/aragon/noticia-lleg...
3,1025850303,20220131,202201,2022,2022.0849,,,,,,,,,,,AFR,AFRICA,AFR,,,,,,,,1,20,20,2,1,3.0,10,1,10,2.676580,0,,,,,,,,1,Congo,CF,CF,,-1.000000,15.0000,CF,1,Congo,CF,CF,,-1.000000,15.0000,CF,20220131103000,https://www.lephareonline.net/muyaya-vos-recom...
4,1025850304,20220131,202201,2022,2022.0849,,,,,,,,,,,ARE,ABU DHABI,ARE,,,,,,,,0,42,42,4,1,1.9,10,1,10,-1.457726,0,,,,,,,,0,,,,,,,,0,,,,,,,,20220131103000,https://www.diariojaen.es/espana/juan-carlos-i...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,1025954044,20220131,202201,2022,2022.0849,USAMED,ASSOCIATED PRESS,USA,,,,,MED,,,UKR,UKRAINIAN,UKR,,,,,,,,0,190,190,19,4,-10.0,8,1,8,-1.384615,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.433300,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.433300,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.433300,30.5167,-1044367,20220131230000,https://www.vesti.ru/article/2670780
429,1025954045,20220131,202201,2022,2022.0849,USAPRIGOV,PUERTO RICO,USA,,,,,GOV,,,,,,,,,,,,,1,51,51,5,1,3.4,10,1,10,-1.067616,1,Puerto Rico,RQ,RQ,,18.235900,-66.4838,RQ,0,,,,,,,,1,Puerto Rico,RQ,RQ,,18.235900,-66.4838,RQ,20220131230000,https://www.telemundopr.com/noticias/puerto-ri...
430,1025954046,20220131,202201,2022,2022.0849,VEN,VENEZUELA,VEN,,,,,,,,MIL,MILITARY,,,,,,MIL,,,1,36,36,3,1,4.0,4,1,4,2.242152,1,Venezuela,VE,VE,,8.000000,-66.0000,VE,1,Venezuela,VE,VE,,8.000000,-66.0000,VE,1,Venezuela,VE,VE,,8.000000,-66.0000,VE,20220131230000,http://www.laverdad.com/zulia/190687-abierta-c...
431,1025954047,20220131,202201,2022,2022.0849,VEN,VENEZUELAN,VEN,,,,,,,,OPP,POLITICAL PRISONER,,,,,,OPP,,,1,111,111,11,3,-2.0,10,1,10,-7.242340,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.023300,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.023300,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.023300,-64.5973,-938440,20220131230000,https://www.elnacional.com/venezuela/denuncian...


In [14]:
translation_mentions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,967171920,20210131064500,20220131103000,1,bnr.bg,https://bnr.bg/post/101593912,1,-1,233,253,1,100,942,1.257862,srclc:bul;eng:GT-BUL 1.0,
1,967171920,20210131064500,20220131103000,1,bnr.bg,https://bnr.bg/post/101593912,1,233,-1,253,1,100,942,1.257862,srclc:bul;eng:GT-BUL 1.0,
2,967161668,20210131033000,20220131103000,1,bnr.bg,https://bnr.bg/post/101593928/mae-ochakva-spad...,6,3214,-1,3222,1,100,3814,-0.445765,srclc:bul;eng:GT-BUL 1.0,
3,967145972,20210131000000,20220131103000,1,lindependant.fr,https://www.lindependant.fr/2022/01/31/meurtre...,8,748,-1,729,1,100,930,-11.764706,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...,
4,967189655,20210131101500,20220131103000,1,securitylab.ru,https://www.securitylab.ru/news/529310.php,8,1491,-1,1506,1,100,4819,-2.017654,srclc:rus;eng:Moses 2.1.1 / MosesCore Europarl...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,1025804790,20220131011500,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2141,2098,0,20,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1117,1025804791,20220131011500,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2122,2078,1,60,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1118,1025954046,20220131230000,20220131230000,1,laverdad.com,http://www.laverdad.com/zulia/190687-abierta-c...,1,32,55,45,0,40,1368,2.242152,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1119,1025954047,20220131230000,20220131230000,1,elnacional.com,https://www.elnacional.com/venezuela/denuncian...,2,107,154,118,1,100,2200,-7.242340,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,


In [15]:
translation_gkg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,20220131103000-T0,2.022013e+13,1.0,sport.aktualne.cz,https://sport.aktualne.cz/fotbal/ceska-liga/sl...,,,,,"1##RB#RB###RB;4#Brussels, Bruxelles-Capitale, ...","4#Brussels, Bruxelles-Capitale, Belgium#BE#BE1...",rekonvalescenti hovorka,"Rekonvalescenti Hovorka,168",league slavia,"League Slavia,21","0,1.44404332129964,1.44404332129964,2.88808664...",,"wc:244,c1.2:3,c1.3:1,c1.4:1,c12.1:11,c12.10:26...",https://cdn.xsd.cz/original/e6fb98e9bafc3849bb...,,,,,"League Slavia,22;Rekonvalescenti Hovorka,176;S...","4,quarter slowed down On,1341;",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_TITLE>Slavia v gener&#xE1;lce na ligu z&...
1,20220131103000-T1,2.022013e+13,1.0,sport.cz,https://www.sport.cz/clanek/olympiada-zoh-2022...,,,,,"4#Liberec, LibereckÃ½, Czech Republic#EZ#EZ83#...",1#Russia#RS#RS##60#100#RS#418;1#Switzerland#SZ...,sochi hadamczik;peter forsbergovi,"Sochi Hadamczik,2676;Peter Forsbergovi,1007",olympics,"Olympics,2442;Olympics,2588;Olympics,3084;Olym...","0.820707070707071,2.52525252525253,1.704545454...",1#0#0#2012#2979;1#0#0#1971#3289;1#0#0#1994#339...,"wc:1474,c12.1:146,c12.10:148,c12.12:42,c12.13:...",https://d16-a.sdn.cz/d_16/c_img_QP_Y/h7bBBs.jp...,https://d16-a.sdn.cz/d_16/c_img_QO_X/liqaf.jpe...,,,,"Peter Forsbergovi,1039;Vancouverem Could,1191;...","40,battle with Switzerland,254;10,takes agains...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>http://www.pravo.cz</PAGE_LINKS><P...
2,20220131103000-T2,2.022013e+13,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/rust-ekonomi...,,,USPEC_POLICY1;EPU_ECONOMY;EPU_ECONOMY_HISTORIC...,"TAX_ETHNICITY_CZECH,1673;TAX_WORLDLANGUAGES_CZ...",1#Germany#GM#GM#51.5#10.5#GM;1#Latvia#LG#LG#57...,1#Czech Republic#EZ#EZ##49.75#15#EZ#1682;1#Lat...,,,,,"-1.34228187919463,0,1.34228187919463,1.3422818...",,"wc:287,c1.2:7,c12.1:7,c12.10:19,c12.12:13,c12....",https://i3.cn.cz/14/1642583735_P2022011903260.jpg,,,https://youtube.com/user/CTKvideozpravy/feed;,,"Against The,1210;Czech Republic,1716","4,quarter slowed down on,32;4,quarter rose aga...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_AUTHORS>&#x10C;TK</PAGE_AUTHORS><PAGE_TI...
3,20220131103000-T3,2.022013e+13,1.0,sedmicka.tyden.cz,https://sedmicka.tyden.cz/rubriky/souteze/sout...,,,WB_678_DIGITAL_GOVERNMENT;WB_694_BROADCAST_AND...,"BAN,231;WB_678_DIGITAL_GOVERNMENT,72;WB_678_DI...",,,,,,,"0,1.76991150442478,1.76991150442478,3.53982300...",,"wc:104,c12.1:4,c12.10:10,c12.12:3,c12.13:2,c12...",https://www.tyden.cz/obrazek/202201/61f783b16e...,,,,,,,srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://sedmicka.tyden.cz/rubriky/...
4,20220131103000-T4,2.022013e+13,1.0,tyden.cz,https://www.tyden.cz/rubriky/zdravi/invazivni-...,,,GENERAL_HEALTH;TAX_DISEASE;TAX_DISEASE_DISEASE...,"TAX_DISEASE_INFECTION,58;BAN,262;TAX_WORLDMAMM...",,,,,,,"-4.47761194029851,0.746268656716418,5.22388059...",,"wc:129,c12.1:8,c12.10:14,c12.12:6,c12.13:2,c12...",https://www.tyden.cz/obrazek/202201/61f7ab88b4...,,,,,,,srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>http://www.sabre.cz;http://www.vsh...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,20220131230000-T1408,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/smartphone-man-hinh-ga...,"AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...","AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...",TAX_ECON_PRICE;TAX_FNCACT;TAX_FNCACT_CHILDREN;...,"IDEOLOGY,5431;MEDIA_MSM,3219;TAX_ECON_PRICE,78...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#Vietnam#VM#VM##16.166667#107.833333#VM#1169,,,,,"1.17860380779692,4.3517679057117,3.17316409791...",,"wc:1070,nwc:1280,c1.1:4,c1.3:2,c12.1:111,c12.1...",https://i.vietgiaitri.com/2022/2/1/smartphone-...,https://i.vietgiaitri.com/2022/1/28/cu-1-ngay-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Pictured Phone,459;Fresh New,2622;Apple New,28...","9,quality brought a cover,2124;1000000000,USD ...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/galaxy-z-f...
1409,20220131230000-T1409,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/the-gioi-da-ghi-nhan-t...,,,CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WELLBEING...,"GENERAL_GOVERNMENT,2283;EPU_POLICY_GOVERNMENT,...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#India#IN#IN##20#77#IN#1674;1#India#IN#IN##20...,europe europe;europe asia,"Europe Europe,822;Europe Europe,978;Europe Asi...",health india,"Health India,1892","-2.68817204301075,2.01612903225806,4.704301075...",,"wc:734,nwc:933,c1.3:1,c12.1:46,c12.10:106,c12....",https://t.vietgiaitri.com/2022/2/1/the-gioi-da...,https://i.vietgiaitri.com/2022/1/17/gioi-chuc-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"World Under,49;Pictured Staff,311;Hospital Pic...","1000000,World Under p statistics,32;1000000,th...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/benh-nhan-...
1410,20220131230000-T1410,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/thuong-lai-dam-nuoc-ma...,WOUND#2000##0######;CRISISLEX_CRISISLEXREC#200...,WOUND#2000##0#######0;CRISISLEX_CRISISLEXREC#2...,WOUND;CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WEL...,"TAX_ECON_PRICE,511;TAX_ECON_PRICE,1077;TAX_ECO...",,,,,english campaign,"English Campaign,1859","1.59340659340659,6.37362637362637,4.7802197802...",,"wc:1799,nwc:1834,c1.1:4,c1.3:1,c12.1:192,c12.1...",https://i.vietgiaitri.com/2022/2/1/thuong-lai-...,https://i.vietgiaitri.com/2022/1/31/giao-thua-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Injured Drive,14;Flower New Year,67;Life Noon,...","2,English Campaign s traders,1558;1000000,copp...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/ban-hoa-ke...
1411,20220131230000-T1411,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/thu-tuong-nhanh-chong-...,,,TAX_FNCACT;TAX_FNCACT_MINISTER;LEADER;TAX_FNCA...,"TAX_FNCACT_GUIDE,1083;TAX_FNCACT_CHILD,1904;TA...","4#Hanoi, Ha N?I, Vietnam, Republic Of#VM#VM44#...",1#Reunion#RE#RE##-21.1#55.6#RE#2973;1#Reunion#...,,,,,"3.31534309946029,6.32228218966847,3.0069390902...",,"wc:1275,nwc:1255,c1.1:1,c1.2:3,c12.1:104,c12.1...",https://t.vietgiaitri.com/2022/2/1/thu-tuong-n...,https://i.vietgiaitri.com/2022/1/22/chu-tich-n...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Prime Minister Fast,20;New Year Life Here,88;P...","1000000,people learning work,3915;1000000000,U...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/bo-giao-th...


### Pre-processing base table

In [16]:
def rename_columns(export, mentions, gkg, translation_export, translation_mentions, translation_gkg):
    """Prefix the positional column labels of the six GDELT tables.

    Every column whose label is an integer ``i`` in ``range(number_of_columns)``
    is renamed to ``<prefix><i>`` (for example ``0`` becomes ``'export_0'``).
    Labels outside that range are left unchanged.

    The frames are modified in place; the same objects are returned, in the
    order in which they were received.
    """
    tables_by_prefix = (
        ('export_', export),
        ('mentions_', mentions),
        ('gkg_', gkg),
        ('export_translation_', translation_export),
        ('mentions_translation_', translation_mentions),
        ('gkg_translation_', translation_gkg),
    )

    for prefix, table in tables_by_prefix:
        # Build the whole old-label -> new-label mapping for this table at once.
        new_labels = {position: prefix + str(position) for position in range(table.shape[1])}
        table.rename(columns=new_labels, inplace=True)

    return export, mentions, gkg, translation_export, translation_mentions, translation_gkg

# Prefix the column labels of all six tables and bind the results to the names
# (export, mentions, gkg and their *_translation counterparts) used by the following cells.
export, mentions, gkg, export_translation, mentions_translation, gkg_translation = rename_columns(export, mentions, gkg, translation_export, translation_mentions, translation_gkg)

In [17]:
export_translation

Unnamed: 0,export_translation_0,export_translation_1,export_translation_2,export_translation_3,export_translation_4,export_translation_5,export_translation_6,export_translation_7,export_translation_8,export_translation_9,export_translation_10,export_translation_11,export_translation_12,export_translation_13,export_translation_14,export_translation_15,export_translation_16,export_translation_17,export_translation_18,export_translation_19,export_translation_20,export_translation_21,export_translation_22,export_translation_23,export_translation_24,export_translation_25,export_translation_26,export_translation_27,export_translation_28,export_translation_29,export_translation_30,export_translation_31,export_translation_32,export_translation_33,export_translation_34,export_translation_35,export_translation_36,export_translation_37,export_translation_38,export_translation_39,export_translation_40,export_translation_41,export_translation_42,export_translation_43,export_translation_44,export_translation_45,export_translation_46,export_translation_47,export_translation_48,export_translation_49,export_translation_50,export_translation_51,export_translation_52,export_translation_53,export_translation_54,export_translation_55,export_translation_56,export_translation_57,export_translation_58,export_translation_59,export_translation_60
0,1025850300,20210131,202101,2021,2021.0849,RUS,RUSSIA,RUS,,,,,,,,USA,UNITED STATES,USA,,,,,,,,0,113,113,11,3,-2.0,8,1,8,-2.403846,4,"Moscow, Moskva, Russia",RS,RS48,25106,55.752200,37.6156,-2960561,1,United States,US,US,,39.828175,-98.5795,US,4,"Moscow, Moskva, Russia",RS,RS48,25106,55.752200,37.6156,-2960561,20220131103000,https://www.wnp.pl/wiadomosci/fogiel-niebezpie...
1,1025850301,20220124,202201,2022,2022.0658,,,,,,,,,,,ESP,SARAGOSSA,ESP,,,,,,,,1,36,36,3,1,4.0,10,1,10,2.029664,0,,,,,,,,1,United States,US,US,,39.828175,-98.5795,US,1,United States,US,US,,39.828175,-98.5795,US,20220131103000,https://www.europapress.es/aragon/noticia-lleg...
2,1025850302,20220124,202201,2022,2022.0658,ESP,SARAGOSSA,ESP,,,,,,,,,,,,,,,,,,1,36,36,3,1,4.0,10,1,10,2.029664,1,United States,US,US,,39.828175,-98.5795,US,0,,,,,,,,1,United States,US,US,,39.828175,-98.5795,US,20220131103000,https://www.europapress.es/aragon/noticia-lleg...
3,1025850303,20220131,202201,2022,2022.0849,,,,,,,,,,,AFR,AFRICA,AFR,,,,,,,,1,20,20,2,1,3.0,10,1,10,2.676580,0,,,,,,,,1,Congo,CF,CF,,-1.000000,15.0000,CF,1,Congo,CF,CF,,-1.000000,15.0000,CF,20220131103000,https://www.lephareonline.net/muyaya-vos-recom...
4,1025850304,20220131,202201,2022,2022.0849,,,,,,,,,,,ARE,ABU DHABI,ARE,,,,,,,,0,42,42,4,1,1.9,10,1,10,-1.457726,0,,,,,,,,0,,,,,,,,0,,,,,,,,20220131103000,https://www.diariojaen.es/espana/juan-carlos-i...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,1025954044,20220131,202201,2022,2022.0849,USAMED,ASSOCIATED PRESS,USA,,,,,MED,,,UKR,UKRAINIAN,UKR,,,,,,,,0,190,190,19,4,-10.0,8,1,8,-1.384615,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.433300,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.433300,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.433300,30.5167,-1044367,20220131230000,https://www.vesti.ru/article/2670780
429,1025954045,20220131,202201,2022,2022.0849,USAPRIGOV,PUERTO RICO,USA,,,,,GOV,,,,,,,,,,,,,1,51,51,5,1,3.4,10,1,10,-1.067616,1,Puerto Rico,RQ,RQ,,18.235900,-66.4838,RQ,0,,,,,,,,1,Puerto Rico,RQ,RQ,,18.235900,-66.4838,RQ,20220131230000,https://www.telemundopr.com/noticias/puerto-ri...
430,1025954046,20220131,202201,2022,2022.0849,VEN,VENEZUELA,VEN,,,,,,,,MIL,MILITARY,,,,,,MIL,,,1,36,36,3,1,4.0,4,1,4,2.242152,1,Venezuela,VE,VE,,8.000000,-66.0000,VE,1,Venezuela,VE,VE,,8.000000,-66.0000,VE,1,Venezuela,VE,VE,,8.000000,-66.0000,VE,20220131230000,http://www.laverdad.com/zulia/190687-abierta-c...
431,1025954047,20220131,202201,2022,2022.0849,VEN,VENEZUELAN,VEN,,,,,,,,OPP,POLITICAL PRISONER,,,,,,OPP,,,1,111,111,11,3,-2.0,10,1,10,-7.242340,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.023300,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.023300,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.023300,-64.5973,-938440,20220131230000,https://www.elnacional.com/venezuela/denuncian...


### Transform date fields into datetime

In [18]:
def date_to_datime(export, mentions, gkg, export_translation, mentions_translation, gkg_translation):
    """Parse the date column (index 1) of each table into pandas datetimes.

    The two export tables are parsed with the day format ``%Y%m%d``; the
    mentions and GKG tables are parsed with the timestamp format
    ``%Y%m%d%H%M%S``. The columns are overwritten in place and the same six
    frames are returned in the order received.
    """
    day_format = '%Y%m%d'
    timestamp_format = '%Y%m%d%H%M%S'

    conversions = (
        (export, "export_1", day_format),
        (mentions, "mentions_1", timestamp_format),
        (gkg, "gkg_1", timestamp_format),
        (export_translation, "export_translation_1", day_format),
        (mentions_translation, "mentions_translation_1", timestamp_format),
        (gkg_translation, "gkg_translation_1", timestamp_format),
    )

    for table, column, date_format in conversions:
        table[column] = pd.to_datetime(table[column], format=date_format)

    return export, mentions, gkg, export_translation, mentions_translation, gkg_translation

# Convert the date column (index 1) of every table to datetimes before the tables are merged and queried.
export, mentions, gkg, export_translation, mentions_translation, gkg_translation = date_to_datime(export, mentions, gkg, export_translation, mentions_translation, gkg_translation)

In [19]:
mentions_translation

Unnamed: 0,mentions_translation_0,mentions_translation_1,mentions_translation_2,mentions_translation_3,mentions_translation_4,mentions_translation_5,mentions_translation_6,mentions_translation_7,mentions_translation_8,mentions_translation_9,mentions_translation_10,mentions_translation_11,mentions_translation_12,mentions_translation_13,mentions_translation_14,mentions_translation_15
0,967171920,2021-01-31 06:45:00,20220131103000,1,bnr.bg,https://bnr.bg/post/101593912,1,-1,233,253,1,100,942,1.257862,srclc:bul;eng:GT-BUL 1.0,
1,967171920,2021-01-31 06:45:00,20220131103000,1,bnr.bg,https://bnr.bg/post/101593912,1,233,-1,253,1,100,942,1.257862,srclc:bul;eng:GT-BUL 1.0,
2,967161668,2021-01-31 03:30:00,20220131103000,1,bnr.bg,https://bnr.bg/post/101593928/mae-ochakva-spad...,6,3214,-1,3222,1,100,3814,-0.445765,srclc:bul;eng:GT-BUL 1.0,
3,967145972,2021-01-31 00:00:00,20220131103000,1,lindependant.fr,https://www.lindependant.fr/2022/01/31/meurtre...,8,748,-1,729,1,100,930,-11.764706,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...,
4,967189655,2021-01-31 10:15:00,20220131103000,1,securitylab.ru,https://www.securitylab.ru/news/529310.php,8,1491,-1,1506,1,100,4819,-2.017654,srclc:rus;eng:Moses 2.1.1 / MosesCore Europarl...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,1025804790,2022-01-31 01:15:00,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2141,2098,0,20,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1117,1025804791,2022-01-31 01:15:00,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2122,2078,1,60,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1118,1025954046,2022-01-31 23:00:00,20220131230000,1,laverdad.com,http://www.laverdad.com/zulia/190687-abierta-c...,1,32,55,45,0,40,1368,2.242152,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1119,1025954047,2022-01-31 23:00:00,20220131230000,1,elnacional.com,https://www.elnacional.com/venezuela/denuncian...,2,107,154,118,1,100,2200,-7.242340,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,


### Merge Table

Left-join `mentions` with the source language extracted from `mentions_translation`, then with `export`, all on the event id.

In [20]:
def merge_table(export, mentions, mentions_translation):
    """Join mentions with their translation language and with the event table.

    Parameters
    ----------
    export : pandas.DataFrame
        Event table; joined on ``export_0``.
    mentions : pandas.DataFrame
        Mentions table; ``mentions_0`` is the join key and every row is kept
        (both joins are left joins).
    mentions_translation : pandas.DataFrame
        Translated mentions table; only ``mentions_translation_0`` (join key)
        and ``mentions_translation_14`` (translation info such as
        ``"srclc:fra;eng:Moses 2.1.1"``) are used. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``mentions`` columns, then ``mentions_translation_0`` and
        ``mentions_translation_14`` (reduced to the language code, e.g.
        ``"fra"``), then the ``export`` columns. Missing or malformed
        translation info yields NaN instead of raising.
    """
    # Explicit copy: the caller's frame stays untouched and pandas has no
    # reason to emit SettingWithCopyWarning on the assignment below.
    sub_mentions_translation = mentions_translation.loc[
        :, ["mentions_translation_0", "mentions_translation_14"]
    ].copy()

    # "srclc:fra;eng:..." -> "fra". The .str accessor propagates NaN, whereas
    # calling x.split(...) on a missing (float) value raises AttributeError.
    # astype(object) keeps .str usable even if the column is entirely NaN.
    sub_mentions_translation["mentions_translation_14"] = (
        sub_mentions_translation["mentions_translation_14"]
        .astype(object)
        .str.split(";").str[0]
        .str.split(":").str[1]
    )

    mentions_mentions_translation = mentions.merge(
        sub_mentions_translation,
        left_on='mentions_0',
        right_on='mentions_translation_0',
        how='left',
    )

    export_mentions_mentions_translation_joined = mentions_mentions_translation.merge(
        export,
        left_on="mentions_0",
        right_on="export_0",
        how='left',
    )

    return export_mentions_mentions_translation_joined

# Left-join the mentions with the translation language and the event (export) table on the event id,
# then display the result.
export_mentions_mentions_translation_joined = merge_table(export, mentions, mentions_translation) 

export_mentions_mentions_translation_joined

Unnamed: 0,mentions_0,mentions_1,mentions_2,mentions_3,mentions_4,mentions_5,mentions_6,mentions_7,mentions_8,mentions_9,mentions_10,mentions_11,mentions_12,mentions_13,mentions_14,mentions_15,mentions_translation_0,mentions_translation_14,export_0,export_1,export_2,export_3,export_4,export_5,export_6,export_7,export_8,export_9,export_10,export_11,export_12,export_13,export_14,export_15,export_16,export_17,export_18,export_19,export_20,export_21,export_22,export_23,export_24,export_25,export_26,export_27,export_28,export_29,export_30,export_31,export_32,export_33,export_34,export_35,export_36,export_37,export_38,export_39,export_40,export_41,export_42,export_43,export_44,export_45,export_46,export_47,export_48,export_49,export_50,export_51,export_52,export_53,export_54,export_55,export_56,export_57,export_58,export_59,export_60
0,1025840133,2022-01-31 09:15:00,20220131103000,1,saharasamay.com,http://www.saharasamay.com/world-news/67663948...,14,2093,2119,2158,0,10,2924,-4.065041,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1025840134,2022-01-31 09:15:00,20220131103000,1,saharasamay.com,http://www.saharasamay.com/world-news/67663948...,14,2093,2119,2158,0,10,2924,-4.065041,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,967145097,2021-01-31 00:15:00,20220131103000,1,mirror.co.uk,https://www.mirror.co.uk/3am/celebrity-news/sp...,8,1861,-1,1876,0,40,2936,2.803738,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,967223391,2021-01-31 17:00:00,20220131103000,1,carolinajournal.com,https://www.carolinajournal.com/opinion-articl...,3,1734,-1,1771,1,10,4575,-4.605263,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,967177421,2021-01-31 08:00:00,20220131103000,1,phys.org,https://phys.org/news/2022-01-explores-tempera...,9,3569,3651,3619,1,100,6811,1.797040,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510624,1025939176,2022-01-31 21:00:00,20220131230000,1,whbl.com,https://whbl.com/2022/01/31/south-africa-scrap...,1,75,106,123,1,60,560,-6.382979,,,,,1.025939e+09,2022-01-31,202201.0,2022.0,2022.0849,ZAF,JOHANNESBURG,ZAF,,,,,,,,AFR,AFRICA,AFR,,,,,,,,1.0,20.0,20.0,2.0,1.0,3.0,6.0,1.0,6.0,-6.382979,4.0,"Johannesburg, Gauteng, South Africa",SF,SF06,77364,-26.2000,28.083300,-1240261,4.0,"Johannesburg, Gauteng, South Africa",SF,SF06,77364,-26.2000,28.083300,-1240261,4.0,"Johannesburg, Gauteng, South Africa",SF,SF06,77364,-26.2000,28.083300,-1240261,2.022013e+13,https://wdsm710.com/2022/01/31/south-africa-sc...
510625,1025953612,2022-01-31 23:00:00,20220131230000,1,jordantimes.com,http://jordantimes.com/news/local/kings-vision...,10,1831,1877,1868,1,30,2907,6.318083,,,,,1.025954e+09,2022-01-31,202201.0,2022.0,2022.0849,ran,RANA,,,ran,,,,,,JOR,JORDAN,JOR,,,,,,,,0.0,10.0,10.0,1.0,1.0,0.0,3.0,1.0,3.0,6.318083,4.0,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4.0,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4.0,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,2.022013e+13,http://jordantimes.com/news/local/kings-vision...
510626,1025953613,2022-01-31 23:00:00,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1232,1226,1,60,5413,-4.592423,,,,,1.025954e+09,2022-01-31,202201.0,2022.0,2022.0849,znd,AZERI,,,znd,,,,,,BUS,COMPANIES,,,,,,BUS,,,1.0,193.0,193.0,19.0,4.0,-10.0,6.0,1.0,6.0,-4.592423,4.0,"Azeri, Ida-Virumaa, Estonia",EN,EN03,15925,59.4506,26.867500,-2621534,4.0,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,4.0,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,2.022013e+13,https://www.msn.com/en-xl/news/other/court-app...
510627,1025953614,2022-01-31 23:00:00,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1206,1150,0,40,5413,-4.592423,,,,,1.025954e+09,2022-01-31,202201.0,2022.0,2022.0849,znd,AZERI,,,znd,,,,,,EST,ESTONIA,EST,,,,,,,,1.0,192.0,192.0,19.0,4.0,-9.5,4.0,1.0,4.0,-4.592423,4.0,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4.0,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4.0,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,2.022013e+13,https://www.msn.com/en-xl/news/other/court-app...


In [21]:
# Number of rows of the joined table that have no value in mentions_translation_14.
export_mentions_mentions_translation_joined["mentions_translation_14"].isnull().sum()

153787

## Requête 1 

In [22]:
def requete_1(export_mentions_mentions_translation_joined,
              output_path=r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete1.csv'):
    """Build the table for request 1 and optionally write it to CSV.

    Parameters
    ----------
    export_mentions_mentions_translation_joined : pandas.DataFrame
        Joined table providing ``mentions_0``, ``mentions_1`` (datetime),
        ``export_53`` and ``mentions_translation_14``. It is not modified.
    output_path : str or path-like or None, optional
        Destination of the CSV export (written without the index). Defaults to
        the location originally hard-coded in this notebook; pass ``None`` to
        skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_event``, ``datetime``, ``country_code``,
        ``source_langue``, ``day``, ``month``, ``year``; rows in which every
        column is missing are removed.
    """
    # Explicit copy so that adding columns never touches (or warns about) the input frame.
    requete1 = export_mentions_mentions_translation_joined.loc[
        :, ["mentions_0", "mentions_1", "export_53", "mentions_translation_14"]
    ].copy()

    mention_time = requete1["mentions_1"]
    requete1['day'] = mention_time.dt.day
    requete1['month'] = mention_time.dt.month
    requete1['year'] = mention_time.dt.year

    requete1 = requete1.rename(columns={"mentions_0": "id_event",
                                        "mentions_1": "datetime",
                                        "export_53": "country_code",
                                        "mentions_translation_14": "source_langue"})

    # Drop the rows in which every column is NaN.
    requete1 = requete1.dropna(how='all')

    if output_path is not None:
        requete1.to_csv(output_path, index=False)

    return requete1

# Build the request-1 table from the joined table (the function also exports it to CSV).
requete1 = requete_1(export_mentions_mentions_translation_joined)

In [23]:
requete1

Unnamed: 0,id_event,datetime,country_code,source_langue,day,month,year
0,1025840133,2022-01-31 09:15:00,,,31,1,2022
1,1025840134,2022-01-31 09:15:00,,,31,1,2022
2,967145097,2021-01-31 00:15:00,,,31,1,2021
3,967223391,2021-01-31 17:00:00,,,31,1,2021
4,967177421,2021-01-31 08:00:00,,,31,1,2021
...,...,...,...,...,...,...,...
510624,1025939176,2022-01-31 21:00:00,SF,,31,1,2022
510625,1025953612,2022-01-31 23:00:00,JO,,31,1,2022
510626,1025953613,2022-01-31 23:00:00,UK,,31,1,2022
510627,1025953614,2022-01-31 23:00:00,UK,,31,1,2022


In [24]:
# Number of request-1 rows per source language (value_counts leaves out missing values by default).
pd.DataFrame(requete1["source_langue"].value_counts())

Unnamed: 0,source_langue
spa,78987
rus,51114
fra,29042
deu,27746
por,21452
bul,20048
ita,17268
ara,13073
ell,12789
ron,9502


In [25]:
# Number of request-1 rows per country code (value_counts leaves out missing values by default).
pd.DataFrame(requete1["country_code"].value_counts())

Unnamed: 0,country_code
US,43436
UK,14763
RS,10072
UP,6561
CA,4326
...,...
WI,1
YI,1
GL,1
TT,1


## Requête 2

In [26]:
def requete_2(export,
              output_path=r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete2.csv'):
    """Build the table for request 2 and optionally write it to CSV.

    Parameters
    ----------
    export : pandas.DataFrame
        Event table providing ``export_0``, ``export_1`` (datetime),
        ``export_53`` and ``export_26``. It is not modified.
    output_path : str or path-like or None, optional
        Destination of the CSV export (written without the index). Defaults to
        the location originally hard-coded in this notebook; pass ``None`` to
        skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_event``, ``datetime``, ``country_code``, ``event_code``,
        ``day``, ``month``, ``year``; rows in which every column is missing
        are removed.
    """
    # Explicit copy so that adding columns never touches (or warns about) the input frame.
    requete2 = export.loc[:, ["export_0", "export_1", "export_53", "export_26"]].copy()

    event_day = requete2["export_1"]
    requete2['day'] = event_day.dt.day
    requete2['month'] = event_day.dt.month
    requete2['year'] = event_day.dt.year

    requete2 = requete2.rename(columns={"export_0": "id_event",
                                        "export_1": "datetime",
                                        "export_53": "country_code",
                                        "export_26": "event_code"})

    # Drop the rows in which every column is NaN.
    requete2 = requete2.dropna(how='all')

    if output_path is not None:
        requete2.to_csv(output_path, index=False)

    return requete2

In [27]:
# NOTE(review): this rebinds the name `requete_2` from the function to its result, so this
# cell cannot be run a second time unless the cell defining the function is re-executed first.
requete_2 = requete_2(export)

In [28]:
requete_2

Unnamed: 0,id_event,datetime,country_code,event_code,day,month,year
0,1025848292,2021-01-31,US,20,31,1,2021
1,1025848293,2021-01-31,US,14,31,1,2021
2,1025848294,2021-01-31,US,14,31,1,2021
3,1025848295,2021-01-31,US,14,31,1,2021
4,1025848296,2021-01-31,US,14,31,1,2021
...,...,...,...,...,...,...,...
997,1025953611,2022-01-31,YM,51,31,1,2022
998,1025953612,2022-01-31,JO,10,31,1,2022
999,1025953613,2022-01-31,UK,193,31,1,2022
1000,1025953614,2022-01-31,UK,192,31,1,2022


## Requête 3

In [29]:
def requete_3(gkg,
              output_path=r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete3.csv'):
    """Build the table for request 3 and optionally write it to CSV.

    Parameters
    ----------
    gkg : pandas.DataFrame
        GKG table providing ``gkg_0``, ``gkg_1`` (datetime), ``gkg_3``,
        ``gkg_7``, ``gkg_11``, ``gkg_9`` and ``gkg_15``. It is not modified.
    output_path : str or path-like or None, optional
        Destination of the CSV export (written without the index). Defaults to
        the location originally hard-coded in this notebook; pass ``None`` to
        skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_gkg``, ``datetime``, ``source_domain``, ``themes``,
        ``persons``, ``locations``, ``avg_tone``, ``day``, ``month``, ``year``.
        ``locations`` holds the text between the last ``#`` and the first
        ``,`` of the raw value; ``avg_tone`` holds the first comma-separated
        number of the raw value as a float.
    """
    # Explicit copy so that adding columns never touches (or warns about) the input frame.
    requete3 = gkg.loc[:, ["gkg_0", "gkg_1", "gkg_3", "gkg_7", "gkg_11", "gkg_9", "gkg_15"]].copy()

    record_time = requete3["gkg_1"]
    requete3['day'] = record_time.dt.day
    requete3['month'] = record_time.dt.month
    requete3['year'] = record_time.dt.year

    requete3 = requete3.rename(columns={"gkg_0": "id_gkg",
                                        "gkg_1": "datetime",
                                        "gkg_3": "source_domain",
                                        "gkg_7": "themes",
                                        "gkg_11": "persons",
                                        "gkg_9": "locations",
                                        "gkg_15": "avg_tone"})

    # str(x) makes the parsing tolerate missing values: a NaN location becomes
    # the string "nan" and a NaN tone becomes float("nan").
    requete3["locations"] = requete3["locations"].apply(lambda x: str(x).split(",")[0].split("#")[-1])
    requete3["avg_tone"] = requete3["avg_tone"].apply(lambda x: float(str(x).split(",")[0]))

    # Drop the rows in which every column is NaN. The result used to be bound
    # to an unrelated name (`requete2`) and was therefore silently discarded.
    requete3 = requete3.dropna(how='all')

    if output_path is not None:
        requete3.to_csv(output_path, index=False)

    return requete3

In [30]:
# Build the request-3 table from the GKG table (the function also exports it to CSV).
requete3 = requete_3(gkg)

In [31]:
requete3

Unnamed: 0,id_gkg,datetime,source_domain,themes,persons,locations,avg_tone,day,month,year
0,20220131103000-0,2022-01-31 10:30:00,cbc.ca,REL_ANTISEMITISM;MEDIA_MSM;PROTEST;URBAN;TAX_F...,amarjeet sohi;michael cooper;cathy heron;duane...,Ottawa,-4.061896,31,1,2022
1,20220131103000-1,2022-01-31 10:30:00,marketwatch.com,GENERAL_HEALTH;MEDICAL;USPEC_POLICY1;EPU_UNCER...,kyle morris,,-1.388889,31,1,2022
2,20220131103000-2,2022-01-31 10:30:00,familylawweek.co.uk,USPEC_POLITICS_GENERAL1;WB_696_PUBLIC_SECTOR_M...,,,2.707276,31,1,2022
3,20220131103000-3,2022-01-31 10:30:00,somersetcountygazette.co.uk,KILL;SOC_GENERALCRIME;TRIAL;WB_2433_CONFLICT_A...,arthur noyer,Pont-De-Beauvoisin,-6.811146,31,1,2022
4,20220131103000-4,2022-01-31 10:30:00,citizen-times.com,TAX_ECON_PRICE;TAX_FNCACT;TAX_FNCACT_CITIZEN;G...,patrick bowen;john boyle asheville,Miami,-0.235294,31,1,2022
...,...,...,...,...,...,...,...,...,...,...
1630,20220131230000-1630,2022-01-31 23:00:00,palmbeachpost.com,UNGP_FORESTS_RIVERS_OCEANS;EPU_CATS_MIGRATION_...,louise pearson;kathy hillard dimpflmaier;lisa ...,Miami,-1.047120,31,1,2022
1631,20220131230000-1631,2022-01-31 23:00:00,cosmopolitan.com,TAX_DISEASE;TAX_DISEASE_CANCER;WB_1406_DISEASE...,,,-1.438849,31,1,2022
1632,20220131230000-1632,2022-01-31 23:00:00,yahoo.com,EPU_CATS_REGULATION;WB_1921_PRIVATE_SECTOR_DEV...,tobias koppers;yehuda katz;stefan penner;tom d...,,-1.147932,31,1,2022
1633,20220131230000-1633,2022-01-31 23:00:00,iheart.com,NATURAL_DISASTER;NATURAL_DISASTER_HEAVY_SNOW;N...,,,-2.962963,31,1,2022


In [32]:
gkg_translation

Unnamed: 0,gkg_translation_0,gkg_translation_1,gkg_translation_2,gkg_translation_3,gkg_translation_4,gkg_translation_5,gkg_translation_6,gkg_translation_7,gkg_translation_8,gkg_translation_9,gkg_translation_10,gkg_translation_11,gkg_translation_12,gkg_translation_13,gkg_translation_14,gkg_translation_15,gkg_translation_16,gkg_translation_17,gkg_translation_18,gkg_translation_19,gkg_translation_20,gkg_translation_21,gkg_translation_22,gkg_translation_23,gkg_translation_24,gkg_translation_25,gkg_translation_26
0,20220131103000-T0,2022-01-31 10:30:00,1.0,sport.aktualne.cz,https://sport.aktualne.cz/fotbal/ceska-liga/sl...,,,,,"1##RB#RB###RB;4#Brussels, Bruxelles-Capitale, ...","4#Brussels, Bruxelles-Capitale, Belgium#BE#BE1...",rekonvalescenti hovorka,"Rekonvalescenti Hovorka,168",league slavia,"League Slavia,21","0,1.44404332129964,1.44404332129964,2.88808664...",,"wc:244,c1.2:3,c1.3:1,c1.4:1,c12.1:11,c12.10:26...",https://cdn.xsd.cz/original/e6fb98e9bafc3849bb...,,,,,"League Slavia,22;Rekonvalescenti Hovorka,176;S...","4,quarter slowed down On,1341;",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_TITLE>Slavia v gener&#xE1;lce na ligu z&...
1,20220131103000-T1,2022-01-31 10:30:00,1.0,sport.cz,https://www.sport.cz/clanek/olympiada-zoh-2022...,,,,,"4#Liberec, LibereckÃ½, Czech Republic#EZ#EZ83#...",1#Russia#RS#RS##60#100#RS#418;1#Switzerland#SZ...,sochi hadamczik;peter forsbergovi,"Sochi Hadamczik,2676;Peter Forsbergovi,1007",olympics,"Olympics,2442;Olympics,2588;Olympics,3084;Olym...","0.820707070707071,2.52525252525253,1.704545454...",1#0#0#2012#2979;1#0#0#1971#3289;1#0#0#1994#339...,"wc:1474,c12.1:146,c12.10:148,c12.12:42,c12.13:...",https://d16-a.sdn.cz/d_16/c_img_QP_Y/h7bBBs.jp...,https://d16-a.sdn.cz/d_16/c_img_QO_X/liqaf.jpe...,,,,"Peter Forsbergovi,1039;Vancouverem Could,1191;...","40,battle with Switzerland,254;10,takes agains...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>http://www.pravo.cz</PAGE_LINKS><P...
2,20220131103000-T2,2022-01-31 10:30:00,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/rust-ekonomi...,,,USPEC_POLICY1;EPU_ECONOMY;EPU_ECONOMY_HISTORIC...,"TAX_ETHNICITY_CZECH,1673;TAX_WORLDLANGUAGES_CZ...",1#Germany#GM#GM#51.5#10.5#GM;1#Latvia#LG#LG#57...,1#Czech Republic#EZ#EZ##49.75#15#EZ#1682;1#Lat...,,,,,"-1.34228187919463,0,1.34228187919463,1.3422818...",,"wc:287,c1.2:7,c12.1:7,c12.10:19,c12.12:13,c12....",https://i3.cn.cz/14/1642583735_P2022011903260.jpg,,,https://youtube.com/user/CTKvideozpravy/feed;,,"Against The,1210;Czech Republic,1716","4,quarter slowed down on,32;4,quarter rose aga...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_AUTHORS>&#x10C;TK</PAGE_AUTHORS><PAGE_TI...
3,20220131103000-T3,2022-01-31 10:30:00,1.0,sedmicka.tyden.cz,https://sedmicka.tyden.cz/rubriky/souteze/sout...,,,WB_678_DIGITAL_GOVERNMENT;WB_694_BROADCAST_AND...,"BAN,231;WB_678_DIGITAL_GOVERNMENT,72;WB_678_DI...",,,,,,,"0,1.76991150442478,1.76991150442478,3.53982300...",,"wc:104,c12.1:4,c12.10:10,c12.12:3,c12.13:2,c12...",https://www.tyden.cz/obrazek/202201/61f783b16e...,,,,,,,srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://sedmicka.tyden.cz/rubriky/...
4,20220131103000-T4,2022-01-31 10:30:00,1.0,tyden.cz,https://www.tyden.cz/rubriky/zdravi/invazivni-...,,,GENERAL_HEALTH;TAX_DISEASE;TAX_DISEASE_DISEASE...,"TAX_DISEASE_INFECTION,58;BAN,262;TAX_WORLDMAMM...",,,,,,,"-4.47761194029851,0.746268656716418,5.22388059...",,"wc:129,c12.1:8,c12.10:14,c12.12:6,c12.13:2,c12...",https://www.tyden.cz/obrazek/202201/61f7ab88b4...,,,,,,,srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>http://www.sabre.cz;http://www.vsh...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,20220131230000-T1408,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/smartphone-man-hinh-ga...,"AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...","AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...",TAX_ECON_PRICE;TAX_FNCACT;TAX_FNCACT_CHILDREN;...,"IDEOLOGY,5431;MEDIA_MSM,3219;TAX_ECON_PRICE,78...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#Vietnam#VM#VM##16.166667#107.833333#VM#1169,,,,,"1.17860380779692,4.3517679057117,3.17316409791...",,"wc:1070,nwc:1280,c1.1:4,c1.3:2,c12.1:111,c12.1...",https://i.vietgiaitri.com/2022/2/1/smartphone-...,https://i.vietgiaitri.com/2022/1/28/cu-1-ngay-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Pictured Phone,459;Fresh New,2622;Apple New,28...","9,quality brought a cover,2124;1000000000,USD ...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/galaxy-z-f...
1409,20220131230000-T1409,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/the-gioi-da-ghi-nhan-t...,,,CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WELLBEING...,"GENERAL_GOVERNMENT,2283;EPU_POLICY_GOVERNMENT,...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#India#IN#IN##20#77#IN#1674;1#India#IN#IN##20...,europe europe;europe asia,"Europe Europe,822;Europe Europe,978;Europe Asi...",health india,"Health India,1892","-2.68817204301075,2.01612903225806,4.704301075...",,"wc:734,nwc:933,c1.3:1,c12.1:46,c12.10:106,c12....",https://t.vietgiaitri.com/2022/2/1/the-gioi-da...,https://i.vietgiaitri.com/2022/1/17/gioi-chuc-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"World Under,49;Pictured Staff,311;Hospital Pic...","1000000,World Under p statistics,32;1000000,th...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/benh-nhan-...
1410,20220131230000-T1410,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/thuong-lai-dam-nuoc-ma...,WOUND#2000##0######;CRISISLEX_CRISISLEXREC#200...,WOUND#2000##0#######0;CRISISLEX_CRISISLEXREC#2...,WOUND;CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WEL...,"TAX_ECON_PRICE,511;TAX_ECON_PRICE,1077;TAX_ECO...",,,,,english campaign,"English Campaign,1859","1.59340659340659,6.37362637362637,4.7802197802...",,"wc:1799,nwc:1834,c1.1:4,c1.3:1,c12.1:192,c12.1...",https://i.vietgiaitri.com/2022/2/1/thuong-lai-...,https://i.vietgiaitri.com/2022/1/31/giao-thua-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Injured Drive,14;Flower New Year,67;Life Noon,...","2,English Campaign s traders,1558;1000000,copp...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/ban-hoa-ke...
1411,20220131230000-T1411,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/thu-tuong-nhanh-chong-...,,,TAX_FNCACT;TAX_FNCACT_MINISTER;LEADER;TAX_FNCA...,"TAX_FNCACT_GUIDE,1083;TAX_FNCACT_CHILD,1904;TA...","4#Hanoi, Ha N?I, Vietnam, Republic Of#VM#VM44#...",1#Reunion#RE#RE##-21.1#55.6#RE#2973;1#Reunion#...,,,,,"3.31534309946029,6.32228218966847,3.0069390902...",,"wc:1275,nwc:1255,c1.1:1,c1.2:3,c12.1:104,c12.1...",https://t.vietgiaitri.com/2022/2/1/thu-tuong-n...,https://i.vietgiaitri.com/2022/1/22/chu-tich-n...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Prime Minister Fast,20;New Year Life Here,88;P...","1000000,people learning work,3915;1000000000,U...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/bo-giao-th...


## Requête 4

In [33]:
def requete_4(gkg,
              output_path=r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete4.csv'):
    """Build the table for request 4 and optionally write it to CSV.

    Parameters
    ----------
    gkg : pandas.DataFrame
        Translated GKG table providing ``gkg_translation_0``,
        ``gkg_translation_7``, ``gkg_translation_11``, ``gkg_translation_9``,
        ``gkg_translation_15`` and ``gkg_translation_25``. It is not modified.
        For backward compatibility, when the frame passed in does not carry
        ``gkg_translation_0`` the module-level ``gkg_translation`` frame is
        used instead, which is what this function always did before.
    output_path : str or path-like or None, optional
        Destination of the CSV export (written without the index). Defaults to
        the location originally hard-coded in this notebook; pass ``None`` to
        skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_gkg_translation``, ``themes``, ``persons``,
        ``locations``, ``avg_tone``, ``source_langue``. Rows with any missing
        value are removed. ``locations`` holds the text between the last ``#``
        and the first ``,`` of the raw value; ``avg_tone`` holds the first
        comma-separated number of the raw value as a float.
    """
    if "gkg_translation_0" in gkg.columns:
        translated = gkg
    else:
        # Existing call sites pass the untranslated GKG frame; keep them
        # working by reading the notebook-level translated table as before.
        translated = gkg_translation

    # Explicit copy so that the edits below never touch (or warn about) the input frame.
    requete4 = translated.loc[:, ["gkg_translation_0", "gkg_translation_7", "gkg_translation_11",
                                  "gkg_translation_9", "gkg_translation_15", "gkg_translation_25"]].copy()

    requete4 = requete4.rename(columns={"gkg_translation_0": "id_gkg_translation",
                                        "gkg_translation_7": "themes",
                                        "gkg_translation_11": "persons",
                                        "gkg_translation_9": "locations",
                                        "gkg_translation_15": "avg_tone",
                                        "gkg_translation_25": "source_langue"})

    # str(x) turns a missing location into the string "nan", so such rows are
    # not removed by the dropna() below.
    requete4["locations"] = requete4["locations"].apply(lambda x: str(x).split(",")[0].split("#")[-1])

    # Drop every row with at least one missing value (the literal string
    # 'None' counts as missing). No all-NaN row can remain afterwards, so a
    # further all-NaN drop is unnecessary.
    requete4 = requete4.replace(to_replace='None', value=np.nan).dropna()
    requete4["avg_tone"] = requete4["avg_tone"].apply(lambda x: float(str(x).split(",")[0]))

    if output_path is not None:
        requete4.to_csv(output_path, index=False)

    return requete4

In [34]:
# NOTE(review): request 4 is built from the translated GKG columns (gkg_translation_*), which the
# `gkg` frame passed here does not carry — consider passing `gkg_translation` instead.
requete4 = requete_4(gkg)

In [35]:
requete4

Unnamed: 0,id_gkg_translation,themes,persons,locations,avg_tone,source_langue
9,20220131103000-T9,LEADER;TAX_FNCACT;TAX_FNCACT_PRESIDENT;USPEC_P...,abu dhabi;isaac herzog;herzog michal;sheikh mo...,IS,2.222222,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...
10,20220131103000-T10,ARMEDCONFLICT;TAX_FNCACT;TAX_FNCACT_MINISTER;T...,eric felley ueli maurer;ueli maurer,,-2.303263,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...
11,20220131103000-T11,SOC_GENERALCRIME;TAX_FNCACT;TAX_FNCACT_VICTIM;...,nordhal lelandais;corporal arthur;nordahl lela...,,-11.250000,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...
12,20220131103000-T12,TERROR;WB_2433_CONFLICT_AND_VIOLENCE;WB_2451_R...,abu dhabi,IS,-4.918033,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...
16,20220131103000-T16,URBAN;KILL;CRISISLEX_T03_DEAD;TAX_FNCACT;TAX_F...,glairons georges-sadoul,,-6.666667,srclc:fra;eng:Moses 2.1.1 / MosesCore Europarl...
...,...,...,...,...,...,...
1379,20220131230000-T1379,LEADER;,buster karno,Senayan,-7.291667,srclc:ind;eng:GT-IND 1.0
1381,20220131230000-T1381,TAX_FNCACT;TAX_FNCACT_POLICE;EDUCATION;,sabang merauke,East Java,-7.027027,srclc:ind;eng:GT-IND 1.0
1400,20220131230000-T1400,WB_137_WATER;TAX_ETHNICITY;TAX_ETHNICITY_CHINE...,europe asia,CH,-0.912863,srclc:vie;eng:GT-VIE 1.0
1401,20220131230000-T1401,TAX_FNCACT;TAX_FNCACT_MINISTER;LEADER;TAX_FNCA...,europe europe,Hanoi,0.372024,srclc:vie;eng:GT-VIE 1.0
